graphql-coverage/calculate_csv_coverage_stats.py at main · pligor/graphql-coverage · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""Calculate coverage statistics from GraphQL schema coverage CSV files."""

from pathlib import Path
from typing import Tuple, List
import pandas as pd
import sys


def process_csv_file(csv_path: Path) -> Tuple[int, int, float]:
  """
  Compute the coverage ratio recorded in one coverage CSV file.

  The file is loaded with pandas; a row counts as covered when its
  'Covered' value compares equal to True.

  Args:
    csv_path: Path to the CSV file

  Returns:
    Tuple of (numerator, denominator, fraction)
    numerator: Number of rows with Covered=True
    denominator: Total number of data rows (excluding header)
    fraction: numerator / denominator, or 0.0 when there are no data rows

  Raises:
    ValueError: If the CSV file has no 'Covered' column
  """
  frame = pd.read_csv(csv_path)

  # Without the 'Covered' column there is nothing meaningful to count.
  if 'Covered' not in frame.columns:
    raise ValueError(f"CSV file {csv_path} does not have a 'Covered' column")

  # The header is consumed by read_csv, so len() counts data rows only.
  row_count = len(frame)
  covered_count = int(frame['Covered'].eq(True).sum())

  # A header-only file yields 0.0 rather than a ZeroDivisionError.
  ratio = covered_count / row_count if row_count != 0 else 0.0

  return covered_count, row_count, ratio


def calculate_statistics(csv_dir: Path) -> None:
  """
  Print per-file and aggregate coverage figures for a directory of CSV files.

  Every '*.csv' entry directly inside csv_dir is considered, except entries
  whose name starts with '.~lock'. Files are handled in sorted order. A file
  with no data rows is reported with a WARNING and skipped; a file whose
  processing raises is reported with an ERROR and skipped. Two aggregates are
  printed at the end: the mean of the per-file fractions and the pooled
  fraction (sum of numerators / sum of denominators).

  Args:
    csv_dir: Directory containing CSV files
  """
  # Collect candidate files up front, already in processing order.
  candidates = sorted(
    path for path in csv_dir.glob('*.csv')
    if not path.name.startswith('.~lock')
  )

  if len(candidates) == 0:
    print(f"No CSV files found in {csv_dir}")
    return

  print(f"Processing {len(candidates)} CSV file(s)...\n")

  per_file_fractions: List[float] = []
  covered_sum = 0
  rows_sum = 0

  for path in candidates:
    try:
      covered, rows, ratio = process_csv_file(path)

      if rows == 0:
        # Header-only files carry no information; keep them out of the totals.
        print(f"{path.name}:")
        print(f"  WARNING: No data rows found, skipping...\n")
      else:
        print(f"{path.name}:")
        print(f"  Covered entries: {covered}")
        print(f"  Total entries: {rows}")
        print(f"  Fraction: {ratio:.4f} ({covered}/{rows})\n")

        per_file_fractions.append(ratio)
        covered_sum = covered_sum + covered
        rows_sum = rows_sum + rows

    except Exception as err:
      # Best effort: one unreadable file must not abort the whole report.
      print(f"{path.name}:")
      print(f"  ERROR: {err}\n")

  if len(per_file_fractions) == 0:
    print("No valid CSV files processed.")
    return

  # Unweighted mean over files versus the row-weighted (pooled) fraction.
  mean_fraction = sum(per_file_fractions) / len(per_file_fractions)
  pooled_fraction = covered_sum / rows_sum if rows_sum != 0 else 0.0

  separator = "=" * 50
  print(separator)
  print("STATISTICS:")
  print(separator)
  print(f"1. Average of all fractions: {mean_fraction:.4f}")
  print(f"2. Overall fraction (sum of numerators / sum of denominators):")
  print(f"   {pooled_fraction:.4f} ({covered_sum}/{rows_sum})")
  print(separator)


def main() -> None:
  """
  Command-line entry point: pick the CSV directory and print its statistics.

  The first command-line argument, when given, is used as the directory and
  must exist and be a directory. Otherwise 'results/csv' next to this script
  is used and must exist. On a failed check an error message is printed and
  the process exits with status 1.
  """
  cli_args = sys.argv[1:]

  if not cli_args:
    # No argument given: fall back to the directory shipped beside the script.
    target_dir = Path(__file__).parent / 'results' / 'csv'
    if not target_dir.exists():
      print(f"Error: Default directory '{target_dir}' does not exist.")
      sys.exit(1)
  else:
    target_dir = Path(cli_args[0])
    if not target_dir.exists():
      print(f"Error: Directory '{target_dir}' does not exist.")
      sys.exit(1)
    if not target_dir.is_dir():
      print(f"Error: '{target_dir}' is not a directory.")
      sys.exit(1)

  calculate_statistics(target_dir)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
  main()