Skip to content

Commit

Permalink
Merge pull request #156 from nexB/add_non_reachable_urls
Browse files (browse the repository at this point in the history)
Add timeout for fetching URL
  • Loading branch information
JonoYang committed Aug 3, 2023
2 parents 4f25883 + 5771cb2 commit 5258a92
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions packagedb/find_source_repo.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -38,7 +38,8 @@ class URLDataReturnType(enum.Enum):
text = "text" # This is the text of the response


- non_reachable_urls = []
+ non_reachable_urls = [
+ ]
CACHE = {
# url: data
}
Expand Down Expand Up @@ -73,7 +74,7 @@ def get_data_from_response(response, data_type=URLDataReturnType.text):
def get_data_from_url(
url,
data_type=URLDataReturnType.text,
- timeout=None,
+ timeout=10,
):
"""
Take a ``url`` as input and return the data from the URL
Expand All @@ -83,6 +84,23 @@ def get_data_from_url(
try:
if not url:
return
+ if url.startswith("https://github.com/assets"):
+ return
+ not_supported_extensions = [
+ ".pdf",
+ ".zip",
+ ".woff2",
+ ".jar",
+ ".js",
+ ".png",
+ ".css",
+ ".svg",
+ ".jpg",
+ ".tgz",
+ ]
+ for extension in not_supported_extensions:
+ if url.endswith(extension):
+ return
if url in non_reachable_urls:
return
if url in CACHE:
Expand Down Expand Up @@ -372,11 +390,11 @@ def get_git_repo_urls(urls):
if url and any(url_hint in url for url_hint in url_hints):
yield url
else:
- if url.startswith("git+"):
+ if url and url.startswith("git+"):
_, _, url = url.partition("git+")
try:
url = get_data_from_url(
- url=url, data_type=URLDataReturnType.url, timeout=(10, None)
+ url=url, data_type=URLDataReturnType.url
)
if not url:
continue
Expand Down

0 comments on commit 5258a92

Please sign in to comment.