def _existing_submission_columns() -> set:
    """Return the names of the columns currently present on ``submissions``.

    The live schema is inspected through the migration's bind so that both
    directions of the migration can be re-run safely.
    """
    schema_inspector = inspect(op.get_bind())
    return {
        column["name"]
        for column in schema_inspector.get_columns("submissions")
    }


def upgrade() -> None:
    """Add the nullable ``image_count`` and ``infobox_count`` integer columns.

    A column is only added when it is not already present on the table.
    """
    present = _existing_submission_columns()

    # Same order as before: image_count first, then infobox_count.
    for column_name in ("image_count", "infobox_count"):
        if column_name in present:
            continue
        op.add_column(
            "submissions",
            sa.Column(column_name, sa.Integer(), nullable=True),
        )


def downgrade() -> None:
    """Drop ``infobox_count`` and ``image_count`` when they exist.

    Columns are removed in the reverse order of their creation.
    """
    present = _existing_submission_columns()

    for column_name in ("infobox_count", "image_count"):
        if column_name in present:
            op.drop_column("submissions", column_name)
submission template_added = db.Column(db.Boolean, nullable=True, default=False) @@ -173,6 +179,8 @@ def __init__( template_added=False, categories_added=None, category_error=None, + image_count=None, + infobox_count=None, ): """ Initialize a new Submission instance @@ -192,6 +200,8 @@ def __init__( template_added: Whether template was automatically added to article (optional) categories_added: List of category names that were automatically added (optional, stored as JSON) category_error: Error message if category attachment failed (optional) + image_count: Number of images in the article (optional) + infobox_count: Number of infoboxes in the article (optional) """ # Set required fields self.user_id = user_id @@ -219,6 +229,8 @@ def __init__( else: self.categories_added = None self.category_error = category_error + self.image_count = image_count + self.infobox_count = infobox_count self.reviewed_by = None self.reviewed_at = None self.review_comment = None @@ -530,6 +542,8 @@ def to_dict(self, include_user_info=False): "article_page_id": self.article_page_id, "article_size_at_start": self.article_size_at_start, "article_expansion_bytes": self.article_expansion_bytes, + "image_count": self.image_count, + "infobox_count": self.infobox_count, "template_added": self.template_added, "categories_added": self.get_categories_added(), "category_error": self.category_error, diff --git a/backend/app/routes/contest_routes.py b/backend/app/routes/contest_routes.py index c5bcaa24..ec434ad3 100644 --- a/backend/app/routes/contest_routes.py +++ b/backend/app/routes/contest_routes.py @@ -31,6 +31,8 @@ check_article_has_category, append_categories_to_article, get_article_reference_count, + get_article_image_count, + get_article_infobox_count, MEDIAWIKI_API_TIMEOUT, ) from app.services.outreach_dashboard import ( @@ -1314,6 +1316,8 @@ def submit_to_contest(contest_id): # pylint: disable=too-many-return-statements article_size_at_start = None article_expansion_bytes = None 
article_reference_count = None + image_count = None + infobox_count = None # --- Fetch Article Information from MediaWiki API --- # MediaWiki API fetching has deep nesting due to complex error handling @@ -1699,6 +1703,31 @@ def submit_to_contest(contest_id): # pylint: disable=too-many-return-statements pass article_reference_count = None + # --- Fetch Image and Infobox Counts --- + # These richness metrics are stored for future use but are not used in + # validation or scoring yet. + try: + image_count = get_article_image_count(article_link) + except Exception as img_error: # pylint: disable=broad-exception-caught + try: + current_app.logger.warning( + "Failed to fetch image count: %s", str(img_error) + ) + except Exception: # pylint: disable=broad-exception-caught + pass + image_count = None + + try: + infobox_count = get_article_infobox_count(article_link) + except Exception as ibx_error: # pylint: disable=broad-exception-caught + try: + current_app.logger.warning( + "Failed to fetch infobox count: %s", str(ibx_error) + ) + except Exception: # pylint: disable=broad-exception-caught + pass + infobox_count = None + # --- Validate Article Requirements --- # Validate article byte count against contest requirements # This check happens after fetching article information from MediaWiki API @@ -2060,6 +2089,8 @@ def submit_to_contest(contest_id): # pylint: disable=too-many-return-statements template_added=template_added, categories_added=categories_added, category_error=category_error, + image_count=image_count, + infobox_count=infobox_count, ) submission.save() diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py index e67a9de4..146b97bb 100644 --- a/backend/app/utils/__init__.py +++ b/backend/app/utils/__init__.py @@ -44,6 +44,8 @@ "append_categories_to_article", "get_article_reference_count", "get_mediawiki_user_edit_count", + "get_article_image_count", + "get_article_infobox_count", ] @@ -948,6 +950,63 @@ def _fetch_footnotes_count(api_url: 
# Compiled once at import time. Both patterns stay case-insensitive, exactly
# as the inline versions were.
#
# The image pattern tolerates optional whitespace after ``[[`` and around the
# namespace colon, mirroring the whitespace tolerance the infobox pattern
# already has after ``{{``. A leading colon (``[[:File:...]]``) is a plain
# link rather than an embedded image and is intentionally not matched.
_IMAGE_LINK_PATTERN = re.compile(r"\[\[\s*(?:File|Image)\s*:", re.IGNORECASE)
_INFOBOX_PATTERN = re.compile(r"\{\{\s*infobox\b", re.IGNORECASE)


def _log_warning(message: str, error: Exception) -> None:
    """Log ``"<message>: <error>"`` as a warning on a best-effort basis.

    Flask's ``current_app`` logger is used when it can be imported and an
    application context is active. Any failure while logging (Flask missing,
    no app context, logger error) is swallowed so that logging can never
    break the caller.

    Args:
        message: Short description of what failed.
        error: The exception that triggered the warning.
    """
    try:
        # Imported lazily so this module has no hard import-time dependency
        # on Flask.
        from flask import current_app

        current_app.logger.warning("%s: %s", message, str(error))
    except Exception:  # pylint: disable=broad-exception-caught
        # Logging must never break core logic, so ignore any logging failures
        pass


def _count_wikitext_matches(
    article_url: str, pattern, failure_message: str
) -> Optional[int]:
    """Fetch an article's wikitext and count non-overlapping pattern matches.

    Args:
        article_url: URL of the article whose wikitext is scanned.
        pattern: Compiled regular expression to search for.
        failure_message: Prefix for the warning logged if anything raises.

    Returns:
        The number of matches (``0`` for wikitext without any match), or
        ``None`` when the wikitext is unavailable or an error occurred.
    """
    try:
        wikitext = get_article_wikitext(article_url)
        if wikitext is None:
            return None
        return sum(1 for _ in pattern.finditer(wikitext))
    except Exception as error:  # pylint: disable=broad-exception-caught
        # Deliberate best-effort: these metrics are informational, so a
        # failure is logged and reported as "unknown" instead of raised.
        _log_warning(failure_message, error)
        return None


def get_article_image_count(article_url: str) -> Optional[int]:
    """Count approximate number of embedded images in article wikitext.

    Detection is a regex scan of the raw wikitext for explicit file links
    such as ``[[File:Example.jpg|...]]`` or ``[[Image:Example.png|...]]``
    (case-insensitive, optional whitespace around the namespace). The count
    is approximate: it does not guarantee that every match results in a
    rendered image, and images supplied through template parameters or
    ``<gallery>`` tags are not counted.

    Args:
        article_url: URL of the article to inspect.

    Returns:
        Number of matched file links, or ``None`` if the wikitext could not
        be retrieved or an unexpected error occurred.
    """
    return _count_wikitext_matches(
        article_url, _IMAGE_LINK_PATTERN, "Failed to fetch image count"
    )


def get_article_infobox_count(article_url: str) -> Optional[int]:
    """Count approximate number of infobox templates in article wikitext.

    Detection is done via a simple regex scan for ``{{infobox ...}}`` in the
    raw wikitext. This is an approximation and may over-count or under-count
    in edge cases (e.g. nested templates, unusual formatting), but is
    sufficient for high-level richness metrics.

    Args:
        article_url: URL of the article to inspect.

    Returns:
        Number of matched infobox openings, or ``None`` if the wikitext could
        not be retrieved or an unexpected error occurred.
    """
    return _count_wikitext_matches(
        article_url, _INFOBOX_PATTERN, "Failed to fetch infobox count"
    )