
Commit d8ae96c
feat: Add logging and update to ydata_profiling
1 parent 2201ce5

22 files changed: +2954 -1844 lines

.Dockerignore

Lines changed: 9 additions & 2 deletions
@@ -78,6 +78,7 @@ target/
 # Virtual environment
 .venv/
 venv/
+.vscode/
 
 # PyCharm
 .idea
@@ -101,7 +102,6 @@ venv/
 README.md
 
 # Library dependecy metadata
-poetry.lock
 
 # github workflows
 .github/
@@ -112,4 +112,11 @@ poetry.lock
 volumes/
 
 # Task
-tasks/
+tasks/
+
+# Example
+app/example/
+
+# Gitpod
+scripts/gitpod*
+scripts/codespaces*

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -162,6 +162,7 @@ cython_debug/
 !.vscode/launch.json
 !.vscode/extensions.json
 !.vscode/*.code-snippets
+.vscode
 
 # Local History for Visual Studio Code
 .history/
@@ -175,4 +176,7 @@ cython_debug/
 .ionide
 
 # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode
-n
+n
+
+# Example
+app/example/

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@ repos:
     hooks:
       - id: flake8
   - repo: https://github.com/timothycrosley/isort
-    rev: 5.9.3
+    rev: 5.12.0
     hooks:
       - id: isort

Dockerfile

Lines changed: 14 additions & 16 deletions
@@ -1,22 +1,20 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9
+FROM python:3.10-slim-buster as requirements-stage
 
-WORKDIR /app
+WORKDIR /tmp
+RUN pip install poetry
+COPY ./pyproject.toml ./poetry.lock* /tmp/
 
-ENV POETRY_VERSION=1.2.0
+RUN mkdir -p /tmp/app
+COPY ./app /tmp/app
 
-# Install Poetry
-RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
-    cd /usr/local/bin && \
-    ln -s /opt/poetry/bin/poetry && \
-    poetry config experimental.new-installer false && \
-    poetry config virtualenvs.create false
+RUN poetry export -f requirements.txt --output requirements.txt --without-hashes
 
-# Copy poetry.lock* in case it doesn't exist in the repo
-COPY ./pyproject.toml ./poetry.lock* /
 
-# Allow installing dev dependencies to run tests
-ARG INSTALL_DEV=false
-RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --no-dev ; fi"
+FROM python:3.10-slim-buster
 
-COPY . .
-ENV PYTHONPATH=/app
+WORKDIR /code
+
+COPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY --from=requirements-stage /tmp/app /code/app
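
Design note: the production Dockerfile is now a two-stage build. Poetry exists only in the throwaway requirements-stage, where poetry export pins the locked dependencies into a plain requirements.txt; the final python:3.10-slim-buster stage installs that file with pip and copies the application code across, so the runtime image ships without Poetry or any build tooling.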

Dockerfile.dev

Lines changed: 4 additions & 7 deletions
@@ -1,22 +1,19 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9
+FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10
 
 WORKDIR /app
 
-ENV POETRY_VERSION=1.2.0
-
 # Install Poetry
-RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
+RUN curl -sSL https://install.python-poetry.org/ | POETRY_HOME=/opt/poetry python && \
     cd /usr/local/bin && \
     ln -s /opt/poetry/bin/poetry && \
-    poetry config experimental.new-installer false && \
     poetry config virtualenvs.create false
 
 # Copy poetry.lock* in case it doesn't exist in the repo
-COPY ./pyproject.toml ./poetry.lock* /
+COPY ./pyproject.toml /
 
 # Allow installing dev dependencies to run tests
 ARG INSTALL_DEV=false
-RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --no-dev ; fi"
+RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --only main ; fi"
 
 COPY . .
 ENV PYTHONPATH=/app

app/api/api_v1/routers/profile.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 from typing import List
 
 from fastapi import APIRouter, Depends
-from pandas_profiling import ProfileReport
+from ydata_profiling import ProfileReport
 
 from app.core.config import Settings
 from app.models.alerts import Alerts
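
Context for this one-line change: ydata-profiling is the renamed continuation of pandas-profiling, so only the import moves. A minimal sketch of the ProfileReport API the router builds on, using invented data rather than anything from this repo:

    import pandas as pd
    from ydata_profiling import ProfileReport

    # Illustrative frame only; the service profiles user-supplied files.
    df = pd.DataFrame({"age": [23, 35, 41], "city": ["Pune", "Delhi", "Mumbai"]})

    # minimal=True skips expensive sections such as correlations.
    profile = ProfileReport(df, title="Example profile", minimal=True)

    # The API layer works with the serialised report, not rendered HTML.
    report_json = profile.to_json()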

app/core/config.py

Lines changed: 7 additions & 0 deletions
@@ -46,5 +46,12 @@ class Settings(BaseSettings):
     # PROFILE SEGMENTS
     SAMPLE_DATA_RENDERER: List[str] = ["head"]
 
+    # LOGGING SETTINGS
+    LOG_LEVEL: str = "DEBUG"
+    LOG_FILE_PATH: str = "logs/app.log"
+    LOG_FILE_SIZE: int = 100_000_000  # 100MB
+    LOG_FILE_BACKUP_COUNT: int = 5
+    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
     class Config:
         env_file = ".env"

app/core/logging.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+import logging
+import logging.config
+import os
+
+from app.core.config import Settings
+
+settings = Settings()
+
+# Create the logs directory if it doesn't exist
+log_directory = os.path.dirname(settings.LOG_FILE_PATH)
+if not os.path.exists(log_directory):
+    os.makedirs(log_directory)
+
+# Configuration dictionary for logging
+LOGGING_CONFIG = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "default": {
+            "format": "%(asctime)s [%(levelname)s] [%(name)s:%(lineno)d] - %(message)s",  # noqa: E501
+            "datefmt": "%Y-%m-%d %H:%M:%S",
+        },
+    },
+    "handlers": {
+        "console": {
+            "class": "rich.logging.RichHandler",
+            "level": settings.LOG_LEVEL,
+        },
+    },
+    "loggers": {
+        "": {
+            "level": settings.LOG_LEVEL,
+            "handlers": ["console"],
+            "propagate": True,
+        },
+        "celery": {
+            "level": settings.LOG_LEVEL,
+            "handlers": ["console"],
+            "propagate": True,
+        },
+    },
+}
+
+# Load the logging configuration
+logging.config.dictConfig(LOGGING_CONFIG)
+
+
+def get_logger(name: str) -> logging.Logger:
+    """
+    Get a logger with the specified name.
+    Args:
+        name (str): The name of the logger.
+    Returns:
+        logging.Logger: The logger instance.
+    """
+    return logging.getLogger(name)
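
Importing this module configures logging as a side effect (dictConfig runs at import time), so get_logger is a thin wrapper over logging.getLogger against an already-configured root. Note that only the Rich console handler is registered here; the LOG_FILE_* settings and the "default" formatter are defined but not yet attached to any handler. Typical usage:

    from app.core.logging import get_logger

    logger = get_logger(__name__)
    logger.info("service started")  # rendered by rich.logging.RichHandler
    logger.debug("shown because LOG_LEVEL defaults to DEBUG")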

app/models/analysis.py

Lines changed: 3 additions & 2 deletions
@@ -1,4 +1,4 @@
-from datetime import datetime, timedelta
+from datetime import datetime
 
 from pydantic.main import BaseModel
 
@@ -7,7 +7,8 @@ class Analysis(BaseModel):
     title: str
     date_start: datetime
     date_end: datetime
-    duration: timedelta
+    # TIME DELTA IS REMOVED IN PROFILING VERSION 4 AND ABOVE
+    # duration: timedelta
 
     class Config:
         underscore_attrs_are_private = True

app/models/correlations.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ class Correlations(BaseModel):
     kendall: Optional[Union[Json, Dict]]
     cramers: Optional[Union[Json, Dict]]
     phi_k: Optional[Union[Json, Dict]]
+    # auto: Optional[Union[Json, Dict, Any]]
 
     class Config:
         underscore_attrs_are_private = True

app/models/duplicates.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
-from typing import Union
+from typing import Any
 
-from pydantic import BaseModel, Json
+from pydantic import BaseModel
 
 
 class Duplicates(BaseModel):
-    __root__: Union[Json, str]
+    __root__: Any

app/models/package.py

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@
 
 
 class Package(BaseModel):
-    pandas_profiling_version: str
-    pandas_profiling_config: str
+    ydata_profiling_version: str
+    ydata_profiling_config: str
 
     class Config:
         underscore_attrs_are_private = True

app/models/scatter.py

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+from typing import Any, Dict
+
 from pydantic import BaseModel
 
 
 class Scatter(BaseModel):
-    pass
+    data: Dict[str, Any]

app/models/variables.py

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@ class VariableProperties(BaseModel):
     mean: Optional[float]
     std: Optional[float]
     variance: Optional[float]
-    min: Optional[int]
-    max: Optional[float]
+    min: Optional[Any]
+    max: Optional[Any]
     kurtosis: Optional[float]
     skewness: Optional[float]
     sum: Optional[float]

app/utils/dataframes.py

Lines changed: 33 additions & 15 deletions
@@ -7,13 +7,14 @@
 import polars.exceptions as pl_exc
 import s3fs
 from charset_normalizer import from_bytes
-from fastapi.logger import logger
 from numpy import bool_
 from requests import get
 
 from app.core.config import Settings
+from app.core.logging import get_logger
 
 setting = Settings()
+logger = get_logger(__name__)
 
 
 def get_encoding(obj: Union[str, bytes], is_object=False) -> str:
@@ -50,7 +51,8 @@ async def get_dataframe_honouring_encoding_async(
     try:
         df = pl.read_csv(source, null_values="NA", infer_schema_length=0)
     except (UnicodeDecodeError, pl_exc.ComputeError) as err:
-        logger.error(f"Could not interpret File encoding : {err}")
+        logger.warning(f"File encoding is not default: {err}")
+        logger.warning("Trying to read file with proper encoding")
         encoding = get_encoding(obj=source, is_object=is_object)
         logger.info(f"File encoding : {encoding}")
         df = pl.read_csv(
@@ -122,7 +124,9 @@ async def get_dataframe_async(file_url: str):
     url = urlparse(file_url)
 
     if url.scheme == "http" or url.scheme == "https":
+        logger.info("Check for files with http/https extension")
         df = await get_dataframe_honouring_encoding_async(file_url)
+        logger.info("Dataframe generated from http/https file")
         return df
 
     elif url.scheme == "s3":
@@ -132,12 +136,19 @@ async def get_dataframe_async(file_url: str):
             secret=setting.S3_SECRET_ACCESS_KEY,
             client_kwargs={"endpoint_url": setting.S3_ENDPOINT_URL},
         )
-
-        with fs.open(f"{url.netloc}{url.path}", "rb") as f:
-            obj = f.read()
-
-        df = await get_dataframe_honouring_encoding_async(obj, is_object=True)
-        return df
+        try:
+            with fs.open(f"{url.netloc}{url.path}", "rb") as f:
+                obj = f.read()
+            logger.info(f"File read from s3 : {url.path}")
+        except Exception as err:
+            logger.error("Could not read file from s3")
+            raise err
+        else:
+            df = await get_dataframe_honouring_encoding_async(
+                obj, is_object=True
+            )
+            logger.info("Dataframe generated from s3 file")
+            return df
 
 
 def get_dataframe(file_url: str):
@@ -156,7 +167,9 @@ def get_dataframe(file_url: str):
     url = urlparse(file_url)
 
     if url.scheme == "http" or url.scheme == "https":
+        logger.info("Check for files with http/https extension")
        df = get_dataframe_honouring_encoding(source=file_url, is_object=False)
+        logger.info("Dataframe generated from http/https file")
        return df
 
     elif url.scheme == "s3":
@@ -166,10 +179,15 @@ def get_dataframe(file_url: str):
             secret=setting.S3_SECRET_ACCESS_KEY,
             client_kwargs={"endpoint_url": setting.S3_ENDPOINT_URL},
         )
-
-        with fs.open(f"{url.netloc}{url.path}", "rb") as f:
-            file_content = f.read()
-        df = get_dataframe_honouring_encoding(
-            source=file_content, is_object=True
-        )
-        return df
+        try:
+            with fs.open(f"{url.netloc}{url.path}", "rb") as f:
+                file_content = f.read()
+        except Exception as err:
+            logger.error("Could not read file from s3")
+            raise err
+        else:
+            df = get_dataframe_honouring_encoding(
+                source=file_content, is_object=True
+            )
+            logger.info("Dataframe generated from s3 file")
+            return df
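
The s3 branches now use try/except/else so that only the object read is guarded: a read failure is logged and re-raised, while CSV parsing runs in the else clause and any encoding trouble is handled separately inside get_dataframe_honouring_encoding. A hypothetical call (bucket and key invented):

    from app.utils.dataframes import get_dataframe

    # Resolves credentials and endpoint from the S3_* fields in Settings.
    df = get_dataframe("s3://example-bucket/datasets/sample.csv")
    print(df.shape)  # polars DataFrame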
