Skip to content

Commit d8410c9

Browse files
committed
Minimize network access
This implements a caching mechanism with a 15-minute expiry for upstream commits. It adds rate-limit detection and exponential back-off for GitHub API calls, and prioritizes git ls-remote over web scraping to reduce network overhead. Change-Id: I40e3bab73a0351ebcbeecd56d41ca570b54d415b
1 parent b979dab commit d8410c9

File tree

1 file changed

+187
-47
lines changed

1 file changed

+187
-47
lines changed

scripts/check-repo.sh

Lines changed: 187 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,33 @@
11
#!/usr/bin/env bash
22

3+
# Parse command line arguments.
#
# Recognized options:
#   --force-refresh, -f   sets FORCE_REFRESH=true
#   --quiet, -q           sets QUIET_MODE=true
#   --help, -h            prints usage and exits with status 0
# Any other argument is rejected with exit status 1.
FORCE_REFRESH=false
QUIET_MODE=false
while [[ $# -gt 0 ]]; do
    case "$1" in
        --force-refresh | -f)
            FORCE_REFRESH=true
            shift
            ;;
        --quiet | -q)
            QUIET_MODE=true
            shift
            ;;
        --help | -h)
            echo "Usage: $0 [--force-refresh|-f] [--quiet|-q] [--help|-h]"
            echo " --force-refresh, -f Force refresh of cached data"
            echo " --quiet, -q Suppress progress and informational output"
            echo " --help, -h Show this help message"
            exit 0
            ;;
        *)
            # Diagnostics go to stderr so they never mix with regular output.
            echo "Unknown option: $1" >&2
            echo "Use --help for usage information" >&2
            exit 1
            ;;
    esac
done
30+
331
# Ensure that the common script exists and is readable, then verify it has no
432
# syntax errors and defines the required function.
533
common_script="$(dirname "$0")/common.sh"
@@ -12,6 +40,22 @@ set_colors
1240

1341
check_github_actions
1442

43+
# When quiet mode is on, redefine progress as a no-op.
case "$QUIET_MODE" in
    true)
        progress() { :; }
        ;;
esac
50+
51+
# Cache configuration. All cache state lives under CACHE_DIR:
#   CACHE_FILE       - the last upstream commit hash that was fetched
#   RATE_LIMIT_FILE  - marker touched when GitHub answers HTTP 429/403
# RATE_LIMIT_FILE is assigned here, ahead of its first use: the
# --force-refresh path removes it before the back-off code would
# otherwise define it.
CACHE_DIR="$HOME/.cache/lab0-c"
CACHE_FILE="$CACHE_DIR/upstream_commit"
RATE_LIMIT_FILE="$CACHE_DIR/rate_limited"
CACHE_EXPIRY=900 # Cache for 15 minutes (in seconds)

# Create cache directory if it doesn't exist
mkdir -p "$CACHE_DIR"
58+
1559
TOTAL_STEPS=6
1660
CURRENT_STEP=0
1761

@@ -46,14 +90,65 @@ fi
4690
((CURRENT_STEP++))
4791
progress "$CURRENT_STEP" "$TOTAL_STEPS"
4892

49-
# Generate a random integer in [0..999].
50-
random_ms=$((RANDOM % 1000))
93+
# Print the modification time of file $1 in seconds since the epoch, or 0
# when it cannot be determined.
# GNU stat (-c %Y) is tried before BSD stat (-f %m): with GNU coreutils,
# `stat -f` means "file system status" and writes unrelated text to stdout
# before failing, which would corrupt the arithmetic done on the result.
_cache_mtime() {
    stat -c %Y "$1" 2>/dev/null || stat -f %m "$1" 2>/dev/null || echo 0
}

# Check if cache exists and is still valid.
# Result: use_cache=true and upstream_hash set from the cache only when the
# cache file is younger than CACHE_EXPIRY seconds and non-empty; otherwise
# use_cache stays false.
use_cache=false
if [ "$FORCE_REFRESH" = true ]; then
    if [ "$QUIET_MODE" = false ]; then
        printf "\r%80s\r" " "
        echo "Force refresh requested. Clearing cache..."
    fi
    # RATE_LIMIT_FILE may not have been assigned yet at this point, so fall
    # back to its location inside CACHE_DIR instead of passing rm an empty
    # argument and leaving the marker behind.
    rm -f "$CACHE_FILE" "${RATE_LIMIT_FILE:-$CACHE_DIR/rate_limited}"
elif [ -f "$CACHE_FILE" ]; then
    cache_age=$(($(date +%s) - $(_cache_mtime "$CACHE_FILE")))
    if [ "$cache_age" -lt "$CACHE_EXPIRY" ]; then
        upstream_hash=$(cat "$CACHE_FILE")
        # An empty cache file is treated as a miss.
        if [ -n "$upstream_hash" ]; then
            use_cache=true
            if [ "$QUIET_MODE" = false ]; then
                printf "\r%80s\r" " "
                echo "Using cached upstream commit (${cache_age}s old, expires in $((CACHE_EXPIRY - cache_age))s)"
            fi
        fi
    else
        if [ "$QUIET_MODE" = false ]; then
            printf "\r%80s\r" " "
            echo "Cache expired (${cache_age}s old). Refreshing..."
        fi
    fi
fi
119+
120+
# Convert a millisecond count ($1, a non-negative integer) into the decimal
# seconds string that `sleep` accepts, e.g. 5 -> 0.005 and 2750 -> 2.750.
# Only printf is used for portability (bc might not be installed).
_ms_to_seconds() {
    printf '%d.%03d' "$(($1 / 1000))" "$(($1 % 1000))"
}

# Only sleep and fetch if not using cache
if [ "$use_cache" = false ]; then
    # Random jitter: an integer in [0..999] milliseconds.
    random_ms=$((RANDOM % 1000))

    # Add a fixed extra delay if we've been rate limited recently.
    RATE_LIMIT_FILE="$CACHE_DIR/rate_limited"
    if [ -f "$RATE_LIMIT_FILE" ]; then
        # GNU stat (-c %Y) must be tried before BSD stat (-f %m): GNU
        # `stat -f` prints file-system details to stdout before failing,
        # which would make the subtraction below a syntax error.
        limited_at=$(stat -c %Y "$RATE_LIMIT_FILE" 2>/dev/null || stat -f %m "$RATE_LIMIT_FILE" 2>/dev/null || echo 0)
        last_limited=$(($(date +%s) - limited_at))
        if [ "$last_limited" -lt 300 ]; then # If rate limited in last 5 minutes
            random_ms=$((random_ms + 2000)) # Add 2 seconds
            if [ "$QUIET_MODE" = false ]; then
                printf "\r%80s\r" " "
                echo "Rate limit detected. Adding delay..."
            fi
        fi
    fi

    # Handles both sub-second delays (0.xxx) and delays of a second or more.
    sleep_time=$(_ms_to_seconds "$random_ms")

    sleep "$sleep_time"
fi
57152

58153
# 2. Fetch latest commit from GitHub
59154
((CURRENT_STEP++))
@@ -62,53 +157,95 @@ progress "$CURRENT_STEP" "$TOTAL_STEPS"
62157
REPO_OWNER=$(git config -l | grep -w remote.origin.url | sed -E 's%^.*github.com[/:]([^/]+)/lab0-c.*%\1%')
63158
REPO_NAME="lab0-c"
64159

65-
repo_html=$(curl -s "https://github.com/${REPO_OWNER}/${REPO_NAME}")
66-
67-
# Extract the default branch name from data-default-branch="..."
68-
DEFAULT_BRANCH=$(echo "$repo_html" | sed -nE "s#.*${REPO_OWNER}/${REPO_NAME}/blob/([^/]+)/LICENSE.*#\1#p" | head -n 1)
160+
# Print an informational message ($1) only when QUIET_MODE is "false".
# The current terminal line is blanked first (CR, 80 spaces, CR).
_say() {
    [ "$QUIET_MODE" = false ] || return 0
    printf "\r%80s\r" " "
    echo "$1"
}

# Succeed when $1 looks like a full commit id: 40 lowercase hex digits
# (SHA-1) or 64 (SHA-256).
_is_commit_hash() {
    [[ "$1" =~ ^([0-9a-f]{40}|[0-9a-f]{64})$ ]]
}

# Only fetch from network if not using cache
if [ "$use_cache" = false ]; then
    # First try git ls-remote. The fully qualified ref keeps the pattern
    # from also matching other refs that merely end in "/master", and
    # head -n 1 guarantees a single hash.
    _say "Checking upstream repository..."
    upstream_hash=$(git ls-remote --heads origin refs/heads/master 2>/dev/null | cut -f1 | head -n 1)

    # If git ls-remote fails or returns empty, fall back to web scraping
    if [ -z "$upstream_hash" ]; then
        _say "git ls-remote failed. Falling back to web scraping..."

        # Send an explicit User-Agent header with every HTTP request.
        USER_AGENT="Mozilla/5.0 (compatible; lab0-c-checker/1.0)"

        # curl -w appends the HTTP status code as the last line of the
        # output; split it off from the page body.
        repo_html=$(curl -s -w "\n%{http_code}" -H "User-Agent: $USER_AGENT" "https://github.com/${REPO_OWNER}/${REPO_NAME}")
        http_code=$(echo "$repo_html" | tail -n 1)
        repo_html=$(echo "$repo_html" | sed '$d')

        # Check for rate limiting (HTTP 429 or 403)
        if [ "$http_code" = "429" ] || [ "$http_code" = "403" ]; then
            # Leave a marker file recording that a rate limit was hit.
            touch "$RATE_LIMIT_FILE"
            _say "GitHub rate limit detected (HTTP $http_code). Using fallback..."

            # Retry git ls-remote once in case the first failure was
            # transient; nothing else is left to try when rate limited.
            upstream_hash=$(git ls-remote --heads origin refs/heads/master 2>/dev/null | cut -f1 | head -n 1)
            if [ -z "$upstream_hash" ]; then
                throw "Rate limited by GitHub and no fallback available. Please try again later."
            fi
        else
            # Extract the default branch name from the repository page's
            # link to the LICENSE file (".../blob/<branch>/LICENSE").
            DEFAULT_BRANCH=$(echo "$repo_html" | sed -nE "s#.*${REPO_OWNER}/${REPO_NAME}/blob/([^/]+)/LICENSE.*#\1#p" | head -n 1)

            if [ "$DEFAULT_BRANCH" != "master" ]; then
                echo "$DEFAULT_BRANCH"
                throw "The default branch for $REPO_OWNER/$REPO_NAME is not 'master'."
            fi

            # Construct the URL to the commits page for the default branch
            COMMITS_URL="https://github.com/${REPO_OWNER}/${REPO_NAME}/commits/${DEFAULT_BRANCH}"

            temp_file=$(mktemp)
            curl -sSL -H "User-Agent: $USER_AGENT" -o "$temp_file" "$COMMITS_URL"

            # Take the first 40-hex-digit hash found in a ".../commit/<hash>" link.
            upstream_hash=$(
                sed -nE 's/.*href="[^"]*\/commit\/([0-9a-f]{40}).*/\1/p' "$temp_file" | head -n 1
            )

            rm -f "$temp_file"

            # If HTML parsing fails, fallback to using GitHub REST API
            if [ -z "$upstream_hash" ]; then
                API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/commits"

                # Start from a known-empty value so that a stale or
                # inherited "response" cannot skip the request below.
                response=""

                # Try to use cached GitHub credentials from GitHub CLI
                # https://docs.github.com/en/get-started/git-basics/caching-your-github-credentials-in-git
                if command -v gh >/dev/null 2>&1; then
                    TOKEN=$(gh auth token 2>/dev/null)
                    if [ -n "$TOKEN" ]; then
                        response=$(curl -sSL -H "Authorization: token $TOKEN" -H "User-Agent: $USER_AGENT" "$API_URL")
                    fi
                fi

                # If response is empty (i.e. token not available or failed), use unauthenticated request.
                if [ -z "$response" ]; then
                    response=$(curl -sSL -H "User-Agent: $USER_AGENT" "$API_URL")
                fi

                # Extract the latest commit SHA from the JSON response
                upstream_hash=$(echo "$response" | grep -m 1 '"sha":' | sed -E 's/.*"sha": "([^"]+)".*/\1/')
            fi
        fi
    fi

    # Reject empty results and anything that is not a well-formed commit
    # id, so that garbage never reaches the cache file.
    if ! _is_commit_hash "$upstream_hash"; then
        throw "Failed to retrieve upstream commit hash from GitHub.\n"
    fi

    # Cache the result
    echo "$upstream_hash" > "$CACHE_FILE"
fi
113250

114251
# 3. Check local repository awareness
@@ -167,6 +304,9 @@ if [ $failed -ne 0 ]; then
167304
exit 1
168305
fi
169306

170-
echo "Fingerprint: $(make_random_string 24 "$REPO_OWNER")"
307+
# Print the fingerprint line only when QUIET_MODE is "false"; the current
# terminal line is blanked first (CR, 80 spaces, CR).
case "$QUIET_MODE" in
    false)
        printf "\r%80s\r" " "
        echo "Fingerprint: $(make_random_string 24 "$REPO_OWNER")"
        ;;
esac
171311

172312
exit 0

0 commit comments

Comments
 (0)