Skip to content

Commit d9cd869

Browse files
gecBurton and george authored
feature/python3.11 (#22)
* now works with python3.11 and automatically converts data to utf8 * updated README * updated poetry.lock --------- Co-authored-by: george <g.e.c.burton@gmal.com>
1 parent 3245c39 commit d9cd869

14 files changed

+357
-221
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# clean your CSVs!
22

33
This command line tool cleans CSV files by:
4-
1. detecting the encoding and converting it to utf-8
4+
1. converting encoding to utf-8
55
2. detecting the delimiter and safely converting it to a comma
66
3. casting all variables to JSON form, i.e. integers, floats, booleans, strings or null.
77

88

99
* install `pip install csv-bleach`
10-
* and run like `python -m run csv_bleach my-data.csv`
10+
* and run like `python -m csv_bleach my-data.csv`
1111

1212
The only option is the output file name; by default it is your original file name with the `.scsv` extension.
1313

csv_bleach/__main__.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import click
66

7-
from csv_bleach.detect_encoding import detect_encoding
87
from csv_bleach.detect_row_count import detect_row_count
98
from csv_bleach.type_casting import infer_types
109

@@ -24,15 +23,12 @@ def cli(file: str, output: Optional[str]):
2423
output = f"{filepath}.scsv"
2524

2625
with open(file, "rb") as input_file:
27-
encoding = detect_encoding(input_file)
28-
29-
with open(file, encoding=encoding) as input_file:
3026
row_count = detect_row_count(input_file)
3127

32-
with open(file, encoding=encoding) as input_file:
28+
with open(file, "rb") as input_file:
3329
type_caster = infer_types(input_file)
3430

35-
with open(file, encoding=encoding) as input_file, open(output, "w") as output_file:
31+
with open(file, "rb") as input_file, open(output, "w") as output_file:
3632
type_caster.process_file(input_file, output_file, row_count)
3733

3834

csv_bleach/detect_delimiter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
QUOTE = 34
88
NEW_LINE = 10
9-
9+
CARRIAGE_RETURN = 13
1010

1111
class DelimiterDetector:
1212
def __init__(self, delimiter_count: dict[str, int]):
@@ -25,7 +25,7 @@ def parse_row(cls, txt: str) -> DelimiterDetector:
2525
not escaped
2626
and not char.isalnum()
2727
and not (char.isspace() and prev and prev.isspace())
28-
and ord(char) not in (NEW_LINE, QUOTE)
28+
and ord(char) not in (NEW_LINE, QUOTE, CARRIAGE_RETURN)
2929
):
3030
chars.append(char)
3131

csv_bleach/detect_encoding.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

csv_bleach/detect_row_count.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,5 @@
1-
from typing import IO
1+
from typing import BinaryIO
22

33

4-
def blocks(files, size=65536):
5-
while True:
6-
b = files.read(size)
7-
if not b:
8-
break
9-
yield b
10-
11-
12-
def detect_row_count(file: IO[str]) -> int:
13-
return sum(bl.count("\n") for bl in blocks(file))
4+
def detect_row_count(file: BinaryIO) -> int:
5+
return sum(1 for _ in file)

csv_bleach/type_casting.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import json
22
import logging
3-
from typing import Any, Iterator, List, TextIO
3+
from typing import Any, Iterator, List, BinaryIO, TextIO
44

55
import click
66

7+
from charset_normalizer import from_bytes
78
from csv_bleach.detect_delimiter import DelimiterDetector
89
from csv_bleach.line_decoder import LineSplit
910

@@ -53,13 +54,14 @@ def type_cast_row(self, i: int, txt: str) -> List[Any]:
5354

5455
return words
5556

56-
def parse_file(self, rows: TextIO) -> Iterator[list]:
57+
def parse_file(self, rows: BinaryIO) -> Iterator[list]:
5758
for i, row in enumerate(rows):
58-
if len(row.strip()) > 0:
59-
typed_row = self.type_cast_row(i, row)
59+
str_row = str(from_bytes(row).best())
60+
if len(str_row.strip()) > 0:
61+
typed_row = self.type_cast_row(i, str_row)
6062
yield typed_row
6163

62-
def process_file(self, input_file: TextIO, output_file: TextIO, row_count: int):
64+
def process_file(self, input_file: BinaryIO, output_file: TextIO, row_count: int):
6365
with click.progressbar(
6466
self.parse_file(input_file),
6567
length=row_count,
@@ -70,11 +72,12 @@ def process_file(self, input_file: TextIO, output_file: TextIO, row_count: int):
7072
output_file.write(json_row)
7173

7274

73-
def infer_types(rows: TextIO) -> TypeCaster:
75+
def infer_types(rows: BinaryIO) -> TypeCaster:
7476
def _read(_rows):
7577
for row in _rows:
76-
if len(row.strip()) > 0:
77-
yield DelimiterDetector.parse_row(row)
78+
str_row = str(from_bytes(row).best())
79+
if len(str_row.strip()) > 0:
80+
yield DelimiterDetector.parse_row(str_row)
7881

7982
dd = DelimiterDetector.combine(_read(rows))
8083
assert len(dd.delimiter_count) == 1, dd.delimiter_count

0 commit comments

Comments
 (0)