Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OPS: make copyright check less chatty #235

Merged
merged 4 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/gradle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
if: github.event_name == 'pull_request'
run: |
git fetch --depth=1 origin ${{github.base_ref}}
scripts/check-copyright.sh origin/${{github.base_ref}}
scripts/check-copyright.sh -- origin/${{github.base_ref}}
- name: Build with Gradle
run: ./gradlew build
- name: Upload test reports
Expand Down
253 changes: 155 additions & 98 deletions scripts/check-copyright.sh
habiblawal1 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -16,103 +16,160 @@
#
# SPDX-License-Identifier: Apache-2.0

# Stop on first unexpected error
set -e

# Emit all arguments as one message on stderr, then terminate with status 1.
die() {
  {
    echo "$@"
  } >&2
  exit 1
}

# Check that git is a command
command -v git > /dev/null 2>&1 || die "Can not find 'git' command."

# Check that base ref has been specified
[ -n "$1" ] || die "Missing required first parameter: git-base-ref"
BASE="$1"

# Check it is a valid ref
git rev-parse --quiet --verify "$BASE" > /dev/null || die "Specified git base ref '$BASE' is not a valid ref in this repository."

# git uses the following one-character change status codes
# A - Added
# C - Copied
# D - Deleted
# M - Modified
# R - Renamed

# Keep track of how many files failed the copyright check so we can find them all and return 0 if there were none
FAILED=0

# Duplicate the current stdout onto fd 3. Log lines written there bypass any
# pipe or command substitution that later captures a function's stdout.
exec 3>&1
# Write the arguments as a single line to the logging descriptor (fd 3).
log() {
  echo "$@" >&3
}

# Look for unsupported changes with Broken (B), changed (T), Unmerged (U), or Unknown (X) status
BAD_FILES="$(git diff --name-status --diff-filter=BTUX "$BASE")"

[ -z "$BAD_FILES" ] || {
echo "‼️ This script ($0) may need fixing to deal with more types of change." >&2
echo "$BAD_FILES" | sed 's/^/🤯 Unsupported change type: /'
FAILED=$(( FAILED + $(echo "$BAD_FILES"|wc -l) ))
}

# Print deleted files, just for completeness
git diff --name-only --diff-filter=D "$BASE" | sed 's/^/🫥 Ignoring deleted file: /'


# Function to print each file from stdin to stdout unless it has good copyright.
#
# Input:   one pathname per line on stdin.
# Output:  the pathnames that FAIL the check, one per line on stdout (callers
#          count these lines); human-readable diagnostics go through log().
# Exits:   via die() when a pathname does not name a regular file.
#
# A file passes when it contains the Apache-2.0 SPDX identifier and a
# "Copyright <year> IBM Corporation and" line whose year equals the year of
# the most recent commit touching the file (as reported by git log).
badCopyrightFilter() {
  # IFS= and -r keep leading/trailing blanks and backslashes in pathnames intact;
  # a plain `read` would silently rewrite such names and the -f test would fail.
  while IFS= read -r filePath; do
    [ -f "$filePath" ] || die "Cannot check copyright in non-existent file: '$filePath'"

    if ! grep -Eq -- "SPDX-License-Identifier: Apache-2.0" "$filePath"; then
      log "👿 License identifier not found: $filePath"
      printf '%s\n' "$filePath"
      continue
    fi

    # Year of the last commit that touched this file.
    yearModified=$(git log -1 --pretty=format:%cd --date=format:%Y -- "$filePath")

    if grep -Eq -- "Copyright $yearModified IBM Corporation and" "$filePath"; then
      log "😅 Copyright OK: $filePath"
      continue
    fi

    # No line with the expected year: work out what (if anything) is there instead.
    existingModifiedYear=$(grep -Eo -- 'Copyright [0-9]{4} IBM Corporation and' "$filePath" | cut -d ' ' -f 2)
    case "$existingModifiedYear" in
      "$yearModified") continue ;;
      "") log "🤬 No copyright year in '$filePath': expected '$yearModified'." ;;
      *) log "😡 Wrong copyright year in '$filePath': expected '$yearModified' but found '$existingModifiedYear'." ;;
    esac
    printf '%s\n' "$filePath"
  done
}

echo "Checking added and modified files..."
FAILED=$((FAILED + $(git diff --name-only --find-copies-harder --diff-filter=AM "$BASE" | badCopyrightFilter | wc -l)))


# Renamed (R) and copied (C) files are more complicated.
# They can report as less than 100% identical even when their contents are the same.
# This is apparently due to metadata changes. Shrug.
# So check whether the contents have changed significantly.

# Compare a file against its origin, ignoring whitespace-only changes.
#   $1 - origin to compare against (callers pass "<base-ref>:<old-path>")
#   $2 - current pathname
# Prints $2 and succeeds when git reports a difference (or fails to compare);
# prints nothing and returns 1 when git finds no significant difference.
isReallyDifferent() {
  if git diff --ignore-all-space --quiet "$1" "$2" 2>/dev/null; then
    return 1
  fi
  echo "$2"
}

# Read status, source, and destination as separate records (lines).
# Check the status is R... or C... (otherwise it was parsed incorrectly).
# Then compare the source and dest for significant differences.
# Lastly, if they were different, check them for copyright.
badCopyrightFilter2() {
while read status && read src && read dst
do
case "$status" in
R*) isReallyDifferent "$BASE:$src" "$dst" || log "🫥 Ignoring renamed file: $src -> $dst" ;;
C*) isReallyDifferent "$BASE:$src" "$dst" || log "🫥 Ignoring copied file: $src -> $dst" ;;
*) die "Unexpected status while parsing git diff output: status='$status' src='$src' dst='$dst'" ;;
# Enforce top-level subshell to avoid leaking environment changes (in case script is sourced)
(
# Stop on first unexpected error
set -e

# Print the command-line synopsis and option summary to stdout.
# printf is used instead of echo because whether echo expands "\t" differs
# between shells (bash's builtin prints it literally by default).
usage() {
  printf 'usage:\t%s [-q|--quiet|-t|--terse|-v|--verbose] git-base-ref\n' "$0"
  printf '\t-h,--help\tprint this usage info\n'
  printf '\t-t,--terse\tprint only the failing file paths\n'
  printf '\t-q,--quiet\tsuppress all non-error output\n'
  printf '\t-v,--verbose\tenable verbose output\n'
}

# Copy stdout and stderr to other file descriptors for logging and error reporting.
# Default routing (option parsing may re-point these later):
#   fd 3 - log:     discarded
#   fd 4 - info:    stdout
#   fd 5 - warning: stdout
#   fd 6 - error:   stderr
exec 3>/dev/null 4>&1 5>&1 6>&2

# echocat: with arguments, print them space-joined on one line; with no
# arguments, copy stdin to stdout (so the helpers below also work as pipe sinks).
# printf is used rather than echo so a message such as "-n" or "-e" (for
# example a pathname) is printed verbatim instead of being taken as an option,
# and if/else is used so a failed write never falls through to reading stdin.
echocat() {
  if [ $# -gt 0 ]; then
    printf '%s\n' "$*"
  else
    cat
  fi
}

# Semantic wrappers: each sends its arguments (or stdin) to one descriptor.
log() { echocat "$@" >&3; }
inf() { echocat "$@" >&4; }
wrn() { echocat "$@" >&5; }
err() { echocat "$@" >&6; }
# die: Perl-style fatal error helper.
# Hands its arguments to err (which reads stdin instead when there are none),
# then exits with status 1.
die() {
  err "$@"; exit 1
}

# Parse script options
while [ $# -gt 0 ]; do
case "$1" in
# -h print a usage message and exits successfully
-h|--help)
usage
exit 0
;;
# -t disable logging and info
# print only last arg to warning
# (this should be just the pathname to make it easy to open in an editor)
-t|--terse)
exec 3>/dev/null 4>/dev/null 5>&1
wrn() { [ $# -le 1 ] || shift $(($#-1)); echocat "$@" >&5; }
shift
;;
# -q disables logging, info, and warning
-q|--quiet)
exec 3>/dev/null 4>/dev/null 5>/dev/null;
shift
;;
# -v enables logging
-v|--verbose)
exec 3>&1 4>&1 5>&1
shift
;;
# -- indicates the explicit end of options, so consume it and exit the loop
--)
shift
break
;;
# any other option-like string is an error
# print error and usage and exit with an error code
-*)
err "$0: unknown option '$1'";
usage | die
;;
# any non-option-like string indicates the end of the options
*) break;;
esac
done | badCopyrightFilter
}

echo "Checking renamed and copied files..."
OUTPUT="$(git diff --name-status --find-copies-harder --diff-filter=CR -z "$BASE" 2>/dev/null | tr '\0' '\n')"
FAILED=$((FAILED + $(echo "$OUTPUT"| badCopyrightFilter2 | wc -l)))
done

exit $FAILED
# Precondition 1: the git executable must be on PATH.
if ! command -v git > /dev/null 2>&1; then
  die "Can not find 'git' command."
fi

# Precondition 2: the first remaining argument names the base ref to diff against.
if [ -z "$1" ]; then
  die "Missing required first parameter: git-base-ref"
fi
BASE="$1"

# Precondition 3: the base ref must resolve to an object in this repository.
if ! git rev-parse --quiet --verify "$BASE" > /dev/null; then
  die "Specified git base ref '$BASE' is not a valid ref in this repository."
fi

# Git uses the following one-character change status codes
# A - Added
# C - Copied
# D - Deleted
# M - Modified
# R - Renamed

# Keep track of how many files failed the copyright check so we can find them all and return 0 if there were none
FAILED=0

# Look for unsupported changes with Broken (B), changed (T), Unmerged (U), or Unknown (X) status
BAD_FILES="$(git diff --name-status --diff-filter=BTUX "$BASE")"

[ -z "$BAD_FILES" ] || {
err "‼️ This script ($0) may need fixing to deal with more types of change."
echo "$BAD_FILES" | sed 's/^/🤯 Unsupported change type: /' | err
FAILED=$(( FAILED + $(echo "$BAD_FILES"|wc -l) ))
}

# Log deleted files
git diff --name-only --diff-filter=D "$BASE" | sed 's/^/🫥 Ignoring deleted file: /' | log

# Function to print each pathname from stdin to stdout unless it has good copyright.
#
# Input:   one pathname per line on stdin.
# Output:  the pathnames that FAIL the check, one per line on stdout (callers
#          count these lines). Failures are also reported through wrn(), with
#          the pathname passed as the last argument; successes go to inf().
# Exits:   via die() when a pathname does not name a regular file.
#
# A file passes when it contains the Apache-2.0 SPDX identifier and a
# "Copyright <year> IBM Corporation and" line whose year equals the year of
# the most recent commit touching the file (as reported by git log).
badCopyrightFilter() {
  # IFS= and -r keep leading/trailing blanks and backslashes in pathnames intact;
  # a plain `read` would silently rewrite such names and the -f test would fail.
  while IFS= read -r filePath; do
    [ -f "$filePath" ] || die "Cannot check copyright in non-existent file: '$filePath'"

    if ! grep -Eq -- "SPDX-License-Identifier: Apache-2.0" "$filePath"; then
      wrn "👿 License identifier not found:" "$filePath"
      printf '%s\n' "$filePath"
      continue
    fi

    # Year of the last commit that touched this file.
    yearModified=$(git log -1 --pretty=format:%cd --date=format:%Y -- "$filePath")

    if grep -Eq -- "Copyright $yearModified IBM Corporation and" "$filePath"; then
      inf "😅 Copyright OK: $filePath"
      continue
    fi

    # No line with the expected year: work out what (if anything) is there instead.
    existingModifiedYear=$(grep -Eo -- 'Copyright [0-9]{4} IBM Corporation and' "$filePath" | cut -d ' ' -f 2)
    case "$existingModifiedYear" in
      "$yearModified") continue ;;
      "") wrn "🤬 No copyright year (expected '$yearModified'):" "$filePath" ;;
      *) wrn "😡 Wrong copyright year (expected '$yearModified' but was '$existingModifiedYear'):" "$filePath" ;;
    esac
    printf '%s\n' "$filePath"
  done
}

inf "Checking added and modified files..."
FAILED=$((FAILED + $(git diff --name-only --find-copies-harder --diff-filter=AM "$BASE" | badCopyrightFilter | wc -l)))

# Renamed (R) and copied (C) files are more complicated.
# They can report as less than 100% identical even when their contents are the same.
# This is apparently due to metadata changes. Shrug.
# So check whether the contents have changed significantly.

# Decide whether a file differs from its origin in more than whitespace.
#   $1 - origin to compare against (callers pass "<base-ref>:<old-path>")
#   $2 - current pathname
# When git reports a difference (or cannot compare), print $2 and succeed;
# otherwise print nothing and return 1.
isReallyDifferent() {
  if git diff --ignore-all-space --quiet "$1" "$2" 2>/dev/null; then
    return 1
  fi
  echo "$2"
}

# Read status, source, and destination as separate records (lines).
# Check the status is R... or C... (otherwise it was parsed incorrectly).
# Then compare the source and dest for significant differences.
# Lastly, if they were different, check them for copyright.
#
# Input:   repeated triples of lines on stdin: status, source path, destination path.
# Output:  whatever badCopyrightFilter prints for the destinations that really changed.
# Reads:   BASE (the git ref the source paths are resolved against).
badCopyrightFilter2() {
  # IFS= and -r keep blanks and backslashes in the pathnames exactly as git emitted them.
  while IFS= read -r status && IFS= read -r src && IFS= read -r dst; do
    case "$status" in
      # A similarity score of 100 means identical content: nothing to check.
      R100) log "🫥 Ignoring renamed file: $src -> $dst" ;;
      C100) log "🫥 Ignoring copied file: $src -> $dst" ;;
      # Otherwise pass the destination on only if it differs beyond whitespace.
      R*) isReallyDifferent "$BASE:$src" "$dst" || log "🫥 Ignoring renamed file: $src -> $dst" ;;
      C*) isReallyDifferent "$BASE:$src" "$dst" || log "🫥 Ignoring copied file: $src -> $dst" ;;
      *) die "Unexpected status while parsing git diff output: status='$status' src='$src' dst='$dst'" ;;
    esac
  done | badCopyrightFilter
}

inf "Checking renamed and copied files..."
# -z makes git emit NUL-separated fields; tr turns them into one field per line
# so badCopyrightFilter2 can read status, source, and destination in turn.
OUTPUT="$(git diff --name-status --find-copies-harder --diff-filter=CR -z "$BASE" 2>/dev/null | tr '\0' '\n')"
FAILED=$((FAILED + $(echo "$OUTPUT"| badCopyrightFilter2 | wc -l)))
# An exit status only carries 8 bits, so an uncapped count of 256 (or 512, ...)
# failures would wrap to 0 and be reported as success. Saturate at 255.
[ "$FAILED" -le 255 ] || FAILED=255
exit "$FAILED"
)
Loading