-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcalculate_csv_coverage_stats.py
More file actions
137 lines (105 loc) · 3.6 KB
/
calculate_csv_coverage_stats.py
File metadata and controls
137 lines (105 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""Calculate coverage statistics from GraphQL schema coverage CSV files."""
from pathlib import Path
from typing import Tuple, List
import pandas as pd
import sys
def process_csv_file(csv_path: Path) -> Tuple[int, int, float]:
    """
    Read one coverage CSV and report how many of its rows are covered.

    Args:
        csv_path: Path to the CSV file

    Returns:
        Tuple of (numerator, denominator, fraction)
        numerator: Number of rows whose 'Covered' value equals True
        denominator: Number of data rows (the header is not counted)
        fraction: numerator / denominator, or 0.0 when there are no rows

    Raises:
        ValueError: If the CSV file has no 'Covered' column
    """
    frame = pd.read_csv(csv_path)

    # Guard clause: without the column there is nothing to count.
    if 'Covered' not in frame.columns:
        raise ValueError(f"CSV file {csv_path} does not have a 'Covered' column")

    row_count = len(frame)
    # Element-wise equality with True; int() turns the numpy sum into a
    # plain Python int.
    covered_count = int(frame['Covered'].eq(True).sum())

    # An empty file yields 0.0 instead of dividing by zero.
    ratio = covered_count / row_count if row_count != 0 else 0.0
    return covered_count, row_count, ratio
def calculate_statistics(csv_dir: Path) -> None:
    """
    Print per-file and aggregate coverage statistics for a directory of CSVs.

    Every ``*.csv`` file directly inside ``csv_dir`` is processed in sorted
    order, except files whose name starts with ``.~lock``. Files with no data
    rows are reported with a warning, and files that raise are reported with
    an error; neither contributes to the summary.

    Args:
        csv_dir: Directory containing CSV files
    """
    candidates = sorted(
        entry for entry in csv_dir.glob('*.csv')
        if not entry.name.startswith('.~lock')
    )
    if len(candidates) == 0:
        print(f"No CSV files found in {csv_dir}")
        return

    print(f"Processing {len(candidates)} CSV file(s)...\n")

    # One (covered, total, ratio) triple per file that had data rows.
    per_file: List[Tuple[int, int, float]] = []
    for entry in candidates:
        try:
            covered, total, ratio = process_csv_file(entry)
            if total != 0:
                print(f"{entry.name}:")
                print(f" Covered entries: {covered}")
                print(f" Total entries: {total}")
                print(f" Fraction: {ratio:.4f} ({covered}/{total})\n")
                per_file.append((covered, total, ratio))
            else:
                print(f"{entry.name}:")
                print(" WARNING: No data rows found, skipping...\n")
        except Exception as err:
            # Best-effort batch run: report the failure and keep going.
            print(f"{entry.name}:")
            print(f" ERROR: {err}\n")

    if len(per_file) == 0:
        print("No valid CSV files processed.")
        return

    ratios = [ratio for _, _, ratio in per_file]
    covered_sum = sum(covered for covered, _, _ in per_file)
    total_sum = sum(total for _, total, _ in per_file)

    # Unweighted mean of per-file ratios versus the pooled ratio over all rows.
    mean_ratio = sum(ratios) / len(ratios)
    pooled_ratio = covered_sum / total_sum if total_sum != 0 else 0.0

    rule = "=" * 50
    print(rule)
    print("STATISTICS:")
    print(rule)
    print(f"1. Average of all fractions: {mean_ratio:.4f}")
    print("2. Overall fraction (sum of numerators / sum of denominators):")
    print(f" {pooled_ratio:.4f} ({covered_sum}/{total_sum})")
    print(rule)
def main() -> None:
    """Resolve the CSV directory and print coverage statistics for it.

    The directory is taken from the first command-line argument when one is
    given; otherwise it defaults to ``results/csv`` next to this script.
    The process exits with status 1 when the chosen path does not exist or
    is not a directory.
    """
    if len(sys.argv) > 1:
        csv_dir = Path(sys.argv[1])
        missing_message = f"Error: Directory '{csv_dir}' does not exist."
    else:
        # No argument given: fall back to the location beside this script.
        csv_dir = Path(__file__).parent / 'results' / 'csv'
        missing_message = f"Error: Default directory '{csv_dir}' does not exist."

    if not csv_dir.exists():
        print(missing_message)
        sys.exit(1)

    # Validate both sources the same way: a default path that exists but is a
    # regular file must be rejected here rather than being globbed and
    # reported as "No CSV files found".
    if not csv_dir.is_dir():
        print(f"Error: '{csv_dir}' is not a directory.")
        sys.exit(1)

    calculate_statistics(csv_dir)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()