
Commit d8ae96c
feat: Add logging and update to ydata_profiling
1 parent 2201ce5

22 files changed: +2954 -1844 lines

.Dockerignore

Lines changed: 9 additions & 2 deletions
@@ -78,6 +78,7 @@ target/
 # Virtual environment
 .venv/
 venv/
+.vscode/
 
 # PyCharm
 .idea
@@ -101,7 +102,6 @@ venv/
 README.md
 
 # Library dependecy metadata
-poetry.lock
 
 # github workflows
 .github/
@@ -112,4 +112,11 @@ poetry.lock
 volumes/
 
 # Task
-tasks/
+tasks/
+
+# Example
+app/example/
+
+# Gitpod
+scripts/gitpod*
+scripts/codespaces*

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -162,6 +162,7 @@ cython_debug/
 !.vscode/launch.json
 !.vscode/extensions.json
 !.vscode/*.code-snippets
+.vscode
 
 # Local History for Visual Studio Code
 .history/
@@ -175,4 +176,7 @@ cython_debug/
 .ionide
 
 # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode
-n
+n
+
+# Example
+app/example/

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@ repos:
     hooks:
       - id: flake8
   - repo: https://github.com/timothycrosley/isort
-    rev: 5.9.3
+    rev: 5.12.0
     hooks:
       - id: isort

Dockerfile

Lines changed: 14 additions & 16 deletions
@@ -1,22 +1,20 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9
+FROM python:3.10-slim-buster as requirements-stage
 
-WORKDIR /app
+WORKDIR /tmp
+RUN pip install poetry
+COPY ./pyproject.toml ./poetry.lock* /tmp/
 
-ENV POETRY_VERSION=1.2.0
+RUN mkdir -p /tmp/app
+COPY ./app /tmp/app
 
-# Install Poetry
-RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
-    cd /usr/local/bin && \
-    ln -s /opt/poetry/bin/poetry && \
-    poetry config experimental.new-installer false && \
-    poetry config virtualenvs.create false
+RUN poetry export -f requirements.txt --output requirements.txt --without-hashes
 
-# Copy poetry.lock* in case it doesn't exist in the repo
-COPY ./pyproject.toml ./poetry.lock* /
 
-# Allow installing dev dependencies to run tests
-ARG INSTALL_DEV=false
-RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --no-dev ; fi"
+FROM python:3.10-slim-buster
 
-COPY . .
-ENV PYTHONPATH=/app
+WORKDIR /code
+
+COPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY --from=requirements-stage /tmp/app /code/app
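
Design note: the production Dockerfile is now a two-stage build. Poetry exists only in the throwaway requirements-stage, where poetry export pins the locked dependencies into a plain requirements.txt; the final python:3.10-slim-buster stage installs that file with pip and copies the application code across, so the runtime image ships without Poetry or any build tooling.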

Dockerfile.dev

Lines changed: 4 additions & 7 deletions
@@ -1,22 +1,19 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9
+FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10
 
 WORKDIR /app
 
-ENV POETRY_VERSION=1.2.0
-
 # Install Poetry
-RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
+RUN curl -sSL https://install.python-poetry.org/ | POETRY_HOME=/opt/poetry python && \
     cd /usr/local/bin && \
     ln -s /opt/poetry/bin/poetry && \
-    poetry config experimental.new-installer false && \
     poetry config virtualenvs.create false
 
 # Copy poetry.lock* in case it doesn't exist in the repo
-COPY ./pyproject.toml ./poetry.lock* /
+COPY ./pyproject.toml /
 
 # Allow installing dev dependencies to run tests
 ARG INSTALL_DEV=false
-RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --no-dev ; fi"
+RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --only main ; fi"
 
 COPY . .
 ENV PYTHONPATH=/app

app/api/api_v1/routers/profile.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 from typing import List
 
 from fastapi import APIRouter, Depends
-from pandas_profiling import ProfileReport
+from ydata_profiling import ProfileReport
 
 from app.core.config import Settings
 from app.models.alerts import Alerts
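
Context for this one-line change: ydata-profiling is the renamed continuation of pandas-profiling, so only the import moves. A minimal sketch of the ProfileReport API the router builds on, using invented data rather than anything from this repo:

    import pandas as pd
    from ydata_profiling import ProfileReport

    # Illustrative frame only; the service profiles user-supplied files.
    df = pd.DataFrame({"age": [23, 35, 41], "city": ["Pune", "Delhi", "Mumbai"]})

    # minimal=True skips expensive sections such as correlations.
    profile = ProfileReport(df, title="Example profile", minimal=True)

    # The API layer works with the serialised report, not rendered HTML.
    report_json = profile.to_json()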

app/core/config.py

Lines changed: 7 additions & 0 deletions
@@ -46,5 +46,12 @@ class Settings(BaseSettings):
     # PROFILE SEGMENTS
     SAMPLE_DATA_RENDERER: List[str] = ["head"]
 
+    # LOGGING SETTINGS
+    LOG_LEVEL: str = "DEBUG"
+    LOG_FILE_PATH: str = "logs/app.log"
+    LOG_FILE_SIZE: int = 100_000_000  # 100MB
+    LOG_FILE_BACKUP_COUNT: int = 5
+    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
     class Config:
         env_file = ".env"

app/core/logging.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+import logging
+import logging.config
+import os
+
+from app.core.config import Settings
+
+settings = Settings()
+
+# Create the logs directory if it doesn't exist
+log_directory = os.path.dirname(settings.LOG_FILE_PATH)
+if not os.path.exists(log_directory):
+    os.makedirs(log_directory)
+
+# Configuration dictionary for logging
+LOGGING_CONFIG = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "default": {
+            "format": "%(asctime)s [%(levelname)s] [%(name)s:%(lineno)d] - %(message)s",  # noqa: E501
+            "datefmt": "%Y-%m-%d %H:%M:%S",
+        },
+    },
+    "handlers": {
+        "console": {
+            "class": "rich.logging.RichHandler",
+            "level": settings.LOG_LEVEL,
+        },
+    },
+    "loggers": {
+        "": {
+            "level": settings.LOG_LEVEL,
+            "handlers": ["console"],
+            "propagate": True,
+        },
+        "celery": {
+            "level": settings.LOG_LEVEL,
+            "handlers": ["console"],
+            "propagate": True,
+        },
+    },
+}
+
+# Load the logging configuration
+logging.config.dictConfig(LOGGING_CONFIG)
+
+
+def get_logger(name: str) -> logging.Logger:
+    """
+    Get a logger with the specified name.
+    Args:
+        name (str): The name of the logger.
+    Returns:
+        logging.Logger: The logger instance.
+    """
+    return logging.getLogger(name)
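
Importing this module configures logging as a side effect (dictConfig runs at import time), so get_logger is a thin wrapper over logging.getLogger against an already-configured root. Note that only the Rich console handler is registered here; the LOG_FILE_* settings and the "default" formatter are defined but not yet attached to any handler. Typical usage:

    from app.core.logging import get_logger

    logger = get_logger(__name__)
    logger.info("service started")  # rendered by rich.logging.RichHandler
    logger.debug("shown because LOG_LEVEL defaults to DEBUG")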

app/models/analysis.py

Lines changed: 3 additions & 2 deletions
@@ -1,4 +1,4 @@
-from datetime import datetime, timedelta
+from datetime import datetime
 
 from pydantic.main import BaseModel
 
@@ -7,7 +7,8 @@ class Analysis(BaseModel):
     title: str
     date_start: datetime
     date_end: datetime
-    duration: timedelta
+    # TIME DELTA IS REMOVED IN PROFILING VERSION 4 AND ABOVE
+    # duration: timedelta
 
     class Config:
         underscore_attrs_are_private = True

app/models/correlations.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ class Correlations(BaseModel):
     kendall: Optional[Union[Json, Dict]]
     cramers: Optional[Union[Json, Dict]]
     phi_k: Optional[Union[Json, Dict]]
+    # auto: Optional[Union[Json, Dict, Any]]
 
     class Config:
         underscore_attrs_are_private = True

app/models/duplicates.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
-from typing import Union
+from typing import Any
 
-from pydantic import BaseModel, Json
+from pydantic import BaseModel
 
 
 class Duplicates(BaseModel):
-    __root__: Union[Json, str]
+    __root__: Any

app/models/package.py

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@
 
 
 class Package(BaseModel):
-    pandas_profiling_version: str
-    pandas_profiling_config: str
+    ydata_profiling_version: str
+    ydata_profiling_config: str
 
     class Config:
         underscore_attrs_are_private = True

app/models/scatter.py

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+from typing import Any, Dict
+
 from pydantic import BaseModel
 
 
 class Scatter(BaseModel):
-    pass
+    data: Dict[str, Any]

app/models/variables.py

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@ class VariableProperties(BaseModel):
     mean: Optional[float]
     std: Optional[float]
     variance: Optional[float]
-    min: Optional[int]
-    max: Optional[float]
+    min: Optional[Any]
+    max: Optional[Any]
     kurtosis: Optional[float]
     skewness: Optional[float]
     sum: Optional[float]

app/utils/dataframes.py

Lines changed: 33 additions & 15 deletions
@@ -7,13 +7,14 @@
 import polars.exceptions as pl_exc
 import s3fs
 from charset_normalizer import from_bytes
-from fastapi.logger import logger
 from numpy import bool_
 from requests import get
 
 from app.core.config import Settings
+from app.core.logging import get_logger
 
 setting = Settings()
+logger = get_logger(__name__)
 
 
 def get_encoding(obj: Union[str, bytes], is_object=False) -> str:
@@ -50,7 +51,8 @@ async def get_dataframe_honouring_encoding_async(
     try:
         df = pl.read_csv(source, null_values="NA", infer_schema_length=0)
     except (UnicodeDecodeError, pl_exc.ComputeError) as err:
-        logger.error(f"Could not interpret File encoding : {err}")
+        logger.warning(f"File encoding is not default: {err}")
+        logger.warning("Trying to read file with proper encoding")
         encoding = get_encoding(obj=source, is_object=is_object)
         logger.info(f"File encoding : {encoding}")
         df = pl.read_csv(
@@ -122,7 +124,9 @@ async def get_dataframe_async(file_url: str):
     url = urlparse(file_url)
 
     if url.scheme == "http" or url.scheme == "https":
+        logger.info("Check for files with http/https extension")
         df = await get_dataframe_honouring_encoding_async(file_url)
+        logger.info("Dataframe generated from http/https file")
         return df
 
     elif url.scheme == "s3":
@@ -132,12 +136,19 @@ async def get_dataframe_async(file_url: str):
             secret=setting.S3_SECRET_ACCESS_KEY,
             client_kwargs={"endpoint_url": setting.S3_ENDPOINT_URL},
         )
-
-        with fs.open(f"{url.netloc}{url.path}", "rb") as f:
-            obj = f.read()
-
-        df = await get_dataframe_honouring_encoding_async(obj, is_object=True)
-        return df
+        try:
+            with fs.open(f"{url.netloc}{url.path}", "rb") as f:
+                obj = f.read()
+            logger.info(f"File read from s3 : {url.path}")
+        except Exception as err:
+            logger.error("Could not read file from s3")
+            raise err
+        else:
+            df = await get_dataframe_honouring_encoding_async(
+                obj, is_object=True
+            )
+            logger.info("Dataframe generated from s3 file")
+            return df
 
 
 def get_dataframe(file_url: str):
@@ -156,7 +167,9 @@ def get_dataframe(file_url: str):
     url = urlparse(file_url)
 
     if url.scheme == "http" or url.scheme == "https":
+        logger.info("Check for files with http/https extension")
        df = get_dataframe_honouring_encoding(source=file_url, is_object=False)
+        logger.info("Dataframe generated from http/https file")
        return df
 
     elif url.scheme == "s3":
@@ -166,10 +179,15 @@ def get_dataframe(file_url: str):
             secret=setting.S3_SECRET_ACCESS_KEY,
             client_kwargs={"endpoint_url": setting.S3_ENDPOINT_URL},
         )
-
-        with fs.open(f"{url.netloc}{url.path}", "rb") as f:
-            file_content = f.read()
-        df = get_dataframe_honouring_encoding(
-            source=file_content, is_object=True
-        )
-        return df
+        try:
+            with fs.open(f"{url.netloc}{url.path}", "rb") as f:
+                file_content = f.read()
+        except Exception as err:
+            logger.error("Could not read file from s3")
+            raise err
+        else:
+            df = get_dataframe_honouring_encoding(
+                source=file_content, is_object=True
+            )
+            logger.info("Dataframe generated from s3 file")
+            return df
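
The s3 branches now use try/except/else so that only the object read is guarded: a read failure is logged and re-raised, while CSV parsing runs in the else clause and any encoding trouble is handled separately inside get_dataframe_honouring_encoding. A hypothetical call (bucket and key invented):

    from app.utils.dataframes import get_dataframe

    # Resolves credentials and endpoint from the S3_* fields in Settings.
    df = get_dataframe("s3://example-bucket/datasets/sample.csv")
    print(df.shape)  # polars DataFrame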
