Skip to content

Commit

Permalink
Merge branch 'main' into 2489-wait-plausible-service
Browse files Browse the repository at this point in the history
  • Loading branch information
dhruvkb authored Oct 13, 2023
2 parents 0f5eb2f + b1e7e17 commit aa10454
Show file tree
Hide file tree
Showing 416 changed files with 660 additions and 24,867 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci_cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,11 @@ jobs:
- name: Build image `${{ matrix.image }}`
uses: docker/build-push-action@v5
with:
context: ${{ matrix.context || matrix.image }}
context: ${{ matrix.context }}
target: ${{ matrix.target }}
push: false
tags: openverse-${{ matrix.image }}
file: ${{ matrix.file }}
cache-from: type=gha,scope=${{ matrix.image }}
cache-to: type=gha,scope=${{ matrix.image }}
outputs: type=docker,dest=/tmp/${{ matrix.image }}.tar
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/pr_label_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
- get_label_groups
steps:
- name: Check aspect label
uses: docker://agilepathway/pull-request-label-checker:v1.5.3
uses: docker://agilepathway/pull-request-label-checker:v1.5.9
with:
one_of: ${{ needs.get_label_groups.outputs.aspect }}
repo_token: ${{ secrets.GITHUB_TOKEN }}
Expand All @@ -55,7 +55,7 @@ jobs:
- get_label_groups
steps:
- name: Check goal label
uses: docker://agilepathway/pull-request-label-checker:v1.5.3
uses: docker://agilepathway/pull-request-label-checker:v1.5.9
with:
one_of: ${{ needs.get_label_groups.outputs.goal }}
repo_token: ${{ secrets.GITHUB_TOKEN }}
Expand All @@ -68,7 +68,7 @@ jobs:
- get_label_groups
steps:
- name: Check priority label
uses: docker://agilepathway/pull-request-label-checker:v1.5.3
uses: docker://agilepathway/pull-request-label-checker:v1.5.9
with:
one_of: ${{ needs.get_label_groups.outputs.priority }}
repo_token: ${{ secrets.GITHUB_TOKEN }}
Expand All @@ -81,7 +81,7 @@ jobs:
- get_label_groups
steps:
- name: Check stack label
uses: docker://agilepathway/pull-request-label-checker:v1.5.3
uses: docker://agilepathway/pull-request-label-checker:v1.5.9
with:
any_of: ${{ needs.get_label_groups.outputs.stack }}
repo_token: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion api/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ COPY --from=awf /usr/local/bin/audiowaveform /usr/local/bin
# - libexempi8: required for watermarking
# - Create directory for dumping API logs
RUN apt-get update \
&& apt-get install -y curl libexempi8 postgresql-client \
&& apt-get install -y curl libexempi8 postgresql-client libc-bin=2.36-9+deb12u3 libc6=2.36-9+deb12u3 \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /var/log/openverse_api/openverse_api.log

Expand Down
88 changes: 54 additions & 34 deletions api/api/controllers/search_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
from django.conf import settings
from django.core.cache import cache

from elasticsearch.exceptions import NotFoundError, RequestError
from elasticsearch.exceptions import BadRequestError, NotFoundError
from elasticsearch_dsl import Q, Search
from elasticsearch_dsl.query import EMPTY_QUERY, MoreLikeThis, Query
from elasticsearch_dsl.query import EMPTY_QUERY, Match, Query, SimpleQueryString, Term
from elasticsearch_dsl.response import Hit, Response

import api.models as models
Expand Down Expand Up @@ -283,7 +283,8 @@ def _exclude_filtered(s: Search):
key=filter_cache_key, timeout=FILTER_CACHE_TIMEOUT, value=filtered_providers
)
to_exclude = [f["provider_identifier"] for f in filtered_providers]
s = s.exclude("terms", provider=to_exclude)
if to_exclude:
s = s.exclude("terms", provider=to_exclude)
return s


Expand Down Expand Up @@ -447,7 +448,7 @@ def search(

if settings.VERBOSE_ES_RESPONSE:
log.info(pprint.pprint(search_response.to_dict()))
except (RequestError, NotFoundError) as e:
except (BadRequestError, NotFoundError) as e:
raise ValueError(e)

results = _post_process_results(
Expand Down Expand Up @@ -495,43 +496,62 @@ def search(
return results, page_count, result_count, search_context.asdict()


def related_media(uuid, index, filter_dead):
"""Given a UUID, find related search results."""
def related_media(uuid: str, index: str, filter_dead: bool) -> list[Hit]:
"""
Given a UUID, finds up to 10 related search results based on title and tags.
search_client = Search(index=index)
Uses Match query for title or SimpleQueryString for tags.
If the item has no title and no tags, returns items by the same creator.
If the item has no title, no tags, and no creator, returns an empty list.
# Convert UUID to sequential ID.
item = search_client
item = item.query("match", identifier=uuid)
_id = item.execute().hits[0].id
:param uuid: The UUID of the item to find related results for.
:param index: The Elasticsearch index to search (e.g. 'image')
:param filter_dead: Whether dead links should be removed.
:return: List of related results.
"""

s = search_client
s = s.query(
MoreLikeThis(
fields=["tags.name", "title", "creator"],
like={"_index": index, "_id": _id},
min_term_freq=1,
max_query_terms=50,
)
)
# Never show mature content in recommendations.
s = s.exclude("term", mature=True)
# Search the default index for the item itself as it might be sensitive.
item_search = Search(index=index)
# TODO: remove `__keyword` after
# https://github.com/WordPress/openverse/pull/3143 is merged.
item_hit = item_search.query(Term(identifier__keyword=uuid)).execute().hits[0]

# Match related using title.
title = getattr(item_hit, "title", None)
tags = getattr(item_hit, "tags", None)
creator = getattr(item_hit, "creator", None)

if not title and not tags:
if not creator:
return []
related_query = Term(creator__keyword=creator)
else:
related_query = None if not title else Match(title=title)

# Match related using tags, if the item has any.
if tags:
# Only use the first 10 tags
tags = " | ".join([tag.name for tag in tags[:10]])
tags_query = SimpleQueryString(fields=["tags.name"], query=tags)
related_query = related_query | tags_query if related_query else tags_query

# Search the filtered index for related items.
s = Search(index=f"{index}-filtered")

# Exclude the current item and mature content.
# TODO: remove `__keyword` after
# https://github.com/WordPress/openverse/pull/3143 is merged.
s = s.query(related_query & ~Term(identifier__keyword=uuid) & ~Term(mature=True))
# Exclude the dynamically disabled sources.
s = _exclude_filtered(s)
page_size = 10
page = 1

page, page_size = 1, 10
start, end = _get_query_slice(s, page_size, page, filter_dead)
s = s[start:end]

response = s.execute()
results = _post_process_results(s, start, end, page_size, response, filter_dead)

result_count, _ = _get_result_and_page_count(response, results, page_size, page)

if not results:
results = []

result_ids = [result.identifier for result in results]
search_context = SearchContext.build(result_ids, index)
return results, result_count, search_context.asdict()
return results or []


def get_sources(index):
Expand Down Expand Up @@ -579,7 +599,7 @@ def _get_result_and_page_count(
response_obj: Response, results: list[Hit] | None, page_size: int, page: int
) -> tuple[int, int]:
"""
Adjust related page count because ES disallows deep pagination of ranked queries.
Adjust page count because ES disallows deep pagination of ranked queries.
:param response_obj: The original Elasticsearch response object.
:param results: The list of filtered result Hits.
Expand Down
8 changes: 8 additions & 0 deletions api/api/templates/robots.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Block API endpoints
User-agent: *
Disallow: /v1/images/
Disallow: /v1/audio/
Disallow: /v1/auth/

User-agent: GPTBot
Disallow: /
7 changes: 4 additions & 3 deletions api/api/views/media_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,22 +158,23 @@ def stats(self, *_, **__):
@action(detail=True)
def related(self, request, identifier=None, *_, **__):
try:
results, num_results, search_context = search_controller.related_media(
results = search_controller.related_media(
uuid=identifier,
index=self.default_index,
filter_dead=True,
)
self.paginator.result_count = num_results
self.paginator.page_count = 1
# `page_size` refers to the maximum number of related images to return.
self.paginator.page_size = 10
# `result_count` is hard-coded and is equal to the page size.
self.paginator.result_count = 10
except ValueError as e:
raise APIException(getattr(e, "message", str(e)))
# If there are no hits in the search controller
except IndexError:
raise APIException("Could not find items.", 404)

serializer_context = search_context | self.get_serializer_context()
serializer_context = self.get_serializer_context()

serializer = self.get_serializer(results, many=True, context=serializer_context)
return self.get_paginated_response(serializer.data)
Expand Down
9 changes: 8 additions & 1 deletion api/conf/urls/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import include, path
from django.views.generic import RedirectView
from django.views.generic import RedirectView, TemplateView
from rest_framework.routers import SimpleRouter

from api.views.audio_views import AudioViewSet
Expand All @@ -35,6 +35,13 @@
path("admin/", admin.site.urls),
path("healthcheck/", HealthCheck.as_view(), name="health"),
path("v1/", include(versioned_paths)),
path(
"robots.txt/",
TemplateView.as_view(
template_name="robots.txt",
content_type="text/plain",
),
),
]

if settings.ENVIRONMENT == "local":
Expand Down
17 changes: 15 additions & 2 deletions automations/python/workflows/set_matrix_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ def ser_set(x):
"api": {"image": "api", "target": "api"},
"api_nginx": {"image": "api_nginx", "context": "api", "target": "nginx"},
"frontend": {"image": "frontend", "target": "app", "build-contexts": "repo_root=."},
"frontend_nginx": {
"image": "frontend_nginx",
"context": "frontend",
"file": "frontend/Dockerfile.nginx",
"target": "nginx",
},
}

if "ci_cd" in changes:
Expand All @@ -46,11 +52,18 @@ def ser_set(x):
build_matrix["image"] |= {"upstream_db", "ingestion_server", "api", "api_nginx"}
publish_matrix["image"] |= {"api", "api_nginx"}
if "frontend" in changes:
build_matrix["image"].add("frontend")
publish_matrix["image"].add("frontend")
build_matrix["image"] |= {"frontend", "frontend_nginx"}
publish_matrix["image"] |= {"frontend", "frontend_nginx"}

build_matrix["include"] = [includes[item] for item in build_matrix["image"]]

# Fill in the per-image defaults: an entry without an explicit build context
# uses its image name, and an entry without an explicit Dockerfile path uses
# `<context>/Dockerfile`.
for entry in build_matrix["include"]:
    has_context = "context" in entry
    if not has_context:
        entry["context"] = entry["image"]

    # Resolved after the context default so the path can build on it.
    has_file = "file" in entry
    if not has_file:
        entry["file"] = f"{entry['context']}/Dockerfile"

do_build = "true" if len(build_matrix["image"]) else "false"
do_publish = "true" if len(publish_matrix["image"]) else "false"
build_matrix = json.dumps(build_matrix, default=ser_set)
Expand Down
2 changes: 2 additions & 0 deletions catalog/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ ENV AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER=s3://openverse-airflow-logs
USER root
RUN apt-get update \
&& apt-get -yqq install \
libc-bin=2.31-13+deb11u7 \
libc6=2.31-13+deb11u7 \
build-essential \
libpq-dev \
libffi-dev \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,12 @@ def get_min_required_approvals(gh: GitHubAPI, pr: dict) -> int:
else:
raise e

if "required_pull_request_reviews" not in branch_protection_rules:
# This can happen in the rare case where a PR is multiple branches deep,
# e.g. it depends on a branch which depends on a branch which depends on main.
# In that case, default to the rules for `main` as a safe default.
branch_protection_rules = get_branch_protection(gh, repo, "main")

return branch_protection_rules["required_pull_request_reviews"][
"required_approving_review_count"
]
Expand Down
2 changes: 1 addition & 1 deletion catalog/requirements_prod.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Note: Unpinned packages have their versions determined by the Airflow constraints file

apache-airflow[amazon,postgres,http]==2.7.0
apache-airflow[amazon,postgres,http]==2.7.1
lxml
psycopg2-binary
requests-file==1.5.1
Expand Down
34 changes: 34 additions & 0 deletions catalog/tests/dags/maintenance/test_pr_review_reminders.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,3 +548,37 @@ def test_ignores_created_at_and_pings_if_urgent_ready_for_review_event_exists(
post_reminders("not_set", dry_run=False)

assert pull["number"] in github["posted_comments"]


def test_falls_back_to_main_on_multiple_branch_levels(
    github,
):
    """
    Check that the approval threshold falls back to the rules for ``main``.

    The pull request targets a non-main base branch whose branch protection
    rules are registered as an empty mapping, i.e. without a
    ``required_pull_request_reviews`` entry. The pull request carries exactly
    as many approvals as ``main`` requires, so no reminder comment is expected.
    """
    # No need to parametrize this, only need to test the one case
    # Make branch protection rules for a branch one-off main.
    # The rules must be registered under the same branch name the pull request
    # targets; the two names previously differed ("non_main" vs "not_main").
    # NOTE(review): presumably the `github` fixture serves these rules keyed by
    # branch name -- confirm against the fixture.
    base_branch = "not_main"
    github["branch_protection"]["openverse"][base_branch] = {}
    past_due_pull = make_pull(Urgency.LOW, old=True, base_branch=base_branch)
    past_due_pull["requested_reviewers"] = [
        make_requested_reviewer(f"reviewer-due-{i}") for i in range(2)
    ]

    min_required_approvals = 4

    # Always use `main` to exercise fallback for non-main branches
    _setup_branch_protection_for_branch(
        github,
        repo="openverse",
        branch="main",
        min_required_approvals=min_required_approvals,
    )

    github["pulls"] += [past_due_pull]
    # Give the pull request exactly the number of approvals `main` requires.
    github["pull_reviews"][past_due_pull["number"]] = [
        make_review("APPROVED"),
    ] * min_required_approvals
    github["events"][past_due_pull["number"]] = make_urgent_events(
        Urgency.LOW, ["review_requested"]
    )

    post_reminders("not_set", dry_run=False)

    assert past_due_pull["number"] not in github["posted_comments"]
17 changes: 17 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,23 @@ services:
- ./docker/nginx/templates:/etc/nginx/templates
- ./docker/nginx/certs:/etc/nginx/certs

frontend_nginx:
  # Only started when the `frontend` Compose profile is enabled.
  profiles:
    - frontend
  # Built from `Dockerfile.nginx` in the frontend directory, stopping at the
  # `nginx` stage.
  build:
    context: ./frontend/
    dockerfile: Dockerfile.nginx
    target: nginx
    # Automatically inferred from env vars, unless specified
    args:
      - SEMANTIC_VERSION=${SEMANTIC_VERSION:-v1.0.0}
      - FRONTEND_NODE_VERSION
      - FRONTEND_PNPM_VERSION
  # Host port 50290 maps to container port 8080.
  ports:
    - "50290:8080"
  environment:
    OPENVERSE_NGINX_UPSTREAM_URL: ${HOST_NETWORK_ADDRESS}:8443
    OPENVERSE_NGINX_PLAUSIBLE_EVENT_URL: http://plausible:8000/api/event

volumes:
api-postgres:
catalog-postgres:
Expand Down
2 changes: 2 additions & 0 deletions docker/upstream_db/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ RUN apt-get update \
&& apt-get -yqq install \
python3-boto3 \
postgresql-plpython3-13 \
libc-bin=2.31-13+deb11u7 \
libc6=2.31-13+deb11u7 \
python3-pip \
libpq-dev \
&& apt-get autoremove -y \
Expand Down
Loading

0 comments on commit aa10454

Please sign in to comment.