-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_duplicates.py
57 lines (47 loc) · 2.72 KB
/
count_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Author: Eduard Fekete
# GitHub: https://github.com/Eddcapone/Duplicate-Line-Analyzer
import argparse
import pandas as pd
from collections import Counter
import os
from tqdm import tqdm
from tabulate import tabulate
import re
def count_duplicate_lines(file_path, top_n, max_chars, min_chars, exclude_pattern=None, include_pattern=None, min_count=1):
    """Count duplicate non-empty lines in a text file and return the top results.

    Each line is stripped of surrounding whitespace, filtered by length and the
    optional include/exclude regexes, truncated to ``max_chars``, and counted.

    Args:
        file_path: Path to the text file to analyze.
        top_n: Maximum number of rows in the returned DataFrame.
        max_chars: Lines are truncated to this many characters before counting.
        min_chars: Stripped lines shorter than this are ignored.
        exclude_pattern: Optional regex; lines matching it are skipped.
        include_pattern: Optional regex; only lines matching it are kept.
        min_count: Only lines occurring at least this many times are returned.

    Returns:
        pandas.DataFrame with columns ['Line', 'Count'], sorted by Count
        descending, filtered to counts >= min_count, limited to top_n rows.
    """
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_rows', None)

    # Compile the regexes once instead of re-scanning the pattern per line.
    include_re = re.compile(include_pattern) if include_pattern else None
    exclude_re = re.compile(exclude_pattern) if exclude_pattern else None

    file_size = os.path.getsize(file_path)
    line_counts = Counter()
    # Open explicitly as UTF-8 so decoding matches the byte-length progress
    # accounting below; errors='replace' keeps one bad byte from aborting a
    # long analysis run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file, \
            tqdm(total=file_size, unit='B', unit_scale=True, desc="Processing file") as progress_bar:
        for line in file:  # buffered iteration instead of a manual readline loop
            progress_bar.update(len(line.encode('utf-8')))
            stripped_line = line.strip()
            if not stripped_line or len(stripped_line) < min_chars:
                continue  # Skip empty / too-short lines
            if include_re and not include_re.search(stripped_line):
                continue  # Skip lines that do not match the include pattern
            if exclude_re and exclude_re.search(stripped_line):
                continue  # Skip lines that match the exclude pattern
            # Count incrementally; avoids holding every kept line in memory.
            line_counts[stripped_line[:max_chars]] += 1

    df = pd.DataFrame(line_counts.items(), columns=['Line', 'Count'])
    df.sort_values(by='Count', ascending=False, inplace=True)
    # Filter the DataFrame to include only lines with counts >= min_count.
    filtered_df = df[df['Count'] >= min_count]
    return filtered_df.head(top_n)
if __name__ == '__main__':
    # CLI entry point: build the argument parser, run the analysis, print a table.
    cli = argparse.ArgumentParser(description='Count duplicate lines in a file with optional regex inclusion and exclusion.')
    cli.add_argument('file_path', type=str, help='Path to the text file.')
    cli.add_argument('top_n', type=int, help='Number of top results to display.')
    cli.add_argument('max_chars', type=int, help='Maximum number of characters per line to consider.')
    cli.add_argument('min_chars', type=int, help='Minimum number of characters a line must contain to be considered.')
    cli.add_argument('--exclude', type=str, default=None, help='Regex pattern to exclude lines that match.')
    cli.add_argument('--include', type=str, default=None, help='Regex pattern to include only lines that match.')
    cli.add_argument('--min-count', type=int, default=1, help='Minimum count a line must have to be included in the results.')
    options = cli.parse_args()

    # Run the analysis with the parsed options and render the result as a psql-style table.
    result = count_duplicate_lines(
        options.file_path,
        options.top_n,
        options.max_chars,
        options.min_chars,
        options.exclude,
        options.include,
        options.min_count,
    )
    print(tabulate(result, headers='keys', tablefmt='psql'))