diff --git a/backend/.gitignore b/backend/.gitignore
new file mode 100644
index 00000000..0a197900
--- /dev/null
+++ b/backend/.gitignore
@@ -0,0 +1,174 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
diff --git a/backend/scripts/automation/1.py b/backend/scripts/automation/1.py
index 4224137c..bab5b958 100644
--- a/backend/scripts/automation/1.py
+++ b/backend/scripts/automation/1.py
@@ -1,9 +1,13 @@
 import smtplib
 from email.message import EmailMessage
+# Load credentials from environment variables instead of hardcoding them
+from dotenv import load_dotenv
+import os
 
-# --- Configuration ---
-EMAIL_ADDRESS = 'vaibhavvaibhu2005@gmail.com'  # Replace with your email
-EMAIL_PASSWORD = ''  # App password or actual password
+load_dotenv()  # Load from .env file
+
+EMAIL_ADDRESS = os.getenv('EMAIL_ADDRESS')
+EMAIL_PASSWORD = os.getenv('EMAIL_PASSWORD')
 
 SUBJECT = 'Thank You & Welcome to the Advisory Team - Linux Campus Club (LCC)'
 # TODO: Update the advisory role below if needed
diff --git a/backend/scripts/image_audio_video/data_processor.py b/backend/scripts/image_audio_video/data_processor.py
new file mode 100644
index 00000000..71146916
--- /dev/null
+++ b/backend/scripts/image_audio_video/data_processor.py
@@ -0,0 +1,263 @@
+from __future__ import annotations
+import os
+import sys
+from typing import Any, Dict, List, Optional, Union
+import pandas as pd
+
+
+class DataProcessor:
+    def __init__(self, verbose: bool = True):
+        self.verbose = verbose
+
+    # ----------------- IO -----------------
+    def read_data(self, file_path: str, chunk_size: Optional[int] = None) -> Union[pd.DataFrame, pd.io.parsers.TextFileReader]:
+        ext = os.path.splitext(file_path)[1].lower().lstrip('.')
+        if ext == 'csv':
+            return pd.read_csv(file_path, chunksize=chunk_size) if chunk_size else pd.read_csv(file_path)
+        if ext == 'json':
+            if chunk_size:
+                return pd.read_json(file_path, lines=True, chunksize=chunk_size)
+            # pd.read_json returns a DataFrame (or Series) directly,
+            # so the result can be returned as-is.
+            data = pd.read_json(file_path)
+            return data
+        if ext in ('xls', 'xlsx'):
+            return pd.read_excel(file_path)
+        if ext == 'txt':
+            return pd.read_csv(file_path, delimiter='\t')
+        raise ValueError(f"Unsupported format: {ext}")
+
+    def write_data(self, data: Union[pd.DataFrame, pd.io.parsers.TextFileReader], file_path: str, format_type: Optional[str] = None) -> None:
+        ext = format_type or os.path.splitext(file_path)[1].lower().lstrip('.')
+        if ext == 'csv':
+            if hasattr(data, '__iter__') and not isinstance(data, pd.DataFrame):
+                first = True
+                for chunk in data:
+                    chunk.to_csv(file_path, index=False, mode='w' if first else 'a', header=first)
+                    first = False
+            else:
+                data.to_csv(file_path, index=False)
+        elif ext == 'json':
+            if hasattr(data, '__iter__') and not isinstance(data, pd.DataFrame):
+                with open(file_path, 'w', encoding='utf-8') as f:
+                    for chunk in data:
+                        # Write each chunk as JSON Lines (one record per line)
+                        chunk.to_json(f, orient='records', lines=True, force_ascii=False)
+            else:
+                data.to_json(file_path, orient='records', force_ascii=False, indent=2)
+        elif ext in ('xls', 'xlsx'):
+            with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
+                data.to_excel(writer, index=False)
+        else:
+            raise ValueError(f"Unsupported format: {ext}")
+        if self.verbose:
+            print(f"Data saved to {file_path}")
+
+    # ----------------- Info -----------------
+    def get_data_info(self, data: pd.DataFrame) -> Dict[str, Any]:
+        return {
+            "rows": int(data.shape[0]),
+            "columns": int(data.shape[1]),
+            "dtypes": {c: str(t) for c, t in data.dtypes.to_dict().items()},
+            "null_counts": data.isna().sum().to_dict(),
+            "duplicates": int(data.duplicated().sum()),
+            "memory_usage": int(data.memory_usage(deep=True).sum())
+        }
+
+    def preview_data(self, data: pd.DataFrame, n: int = 5) -> pd.DataFrame:
+        return data.head(n)
+
+    def get_shape(self, data: pd.DataFrame) -> tuple:
+        return data.shape
+
+    # ----------------- Cleaning -----------------
+    def clean_data(self, data: pd.DataFrame, operations: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+        df = data.copy()
+        if operations is None:
+            operations = {"drop_duplicates": True, "fill_nulls": True, "strip_strings": True}
+
+        if operations.get("drop_duplicates"):
+            before = len(df)
+            df = df.drop_duplicates()
+            if self.verbose:
+                print(f"Removed {before - len(df)} duplicate rows")
+
+        if operations.get("fill_nulls"):
+            for col in df.columns:
+                if df[col].dtype == 'object':
+                    nulls = df[col].isna().sum()
+                    df[col] = df[col].fillna("Unknown")
+                    if self.verbose and nulls > 0:
+                        print(f"Filled {nulls} nulls in column '{col}' with 'Unknown'")
+                else:
+                    nulls = df[col].isna().sum()
+                    df[col] = df[col].fillna(df[col].mean())
+                    if self.verbose and nulls > 0:
+                        print(f"Filled {nulls} nulls in column '{col}' with mean value")
+
+        if operations.get("strip_strings"):
+            for col in df.select_dtypes(include=['object']).columns:
+                df[col] = df[col].astype(str).str.strip()
+                if self.verbose:
+                    print(f"Stripped whitespace in column '{col}'")
+
+        return df
+
+    # ----------------- Filtering -----------------
+    def filter_data(self, data: pd.DataFrame, conditions: List[Dict[str, Any]]) -> pd.DataFrame:
+        df = data.copy()
+        for cond in conditions:
+            col = cond["column"]
+            op = cond["operator"]
+            val = cond.get("value")
+            if op == "equals":
+                df = df[df[col] == val]
+            elif op == "not_equals":
+                df = df[df[col] != val]
+            elif op == "greater_than":
+                df = df[df[col] > val]
+            elif op == "less_than":
+                df = df[df[col] < val]
+            elif op == "contains":
+                df = df[df[col].astype(str).str.contains(str(val), na=False)]
+            elif op == "in":
+                df = df[df[col].isin(val)]
+            elif op == "between":
+                df = df[df[col].between(val[0], val[1])]
+        return df
+
+    # ----------------- Aggregation / Merge / Pivot -----------------
+    def aggregate_data(self, data: pd.DataFrame, group_by: Union[str, List[str]], aggregations: Dict[str, Union[str, List[str]]]) -> pd.DataFrame:
+        return data.groupby(group_by).agg(aggregations).reset_index()
+
+    def merge_datasets(self, data1: pd.DataFrame, data2: pd.DataFrame, on: Optional[Union[str, List[str]]] = None, how: str = 'inner') -> pd.DataFrame:
+        return pd.merge(data1, data2, on=on, how=how)
+
+    def pivot_data(self, data: pd.DataFrame, index: Union[str, List[str]], columns: Union[str, List[str]], values: Union[str, List[str]], aggfunc: str = 'sum') -> pd.DataFrame:
+        return pd.pivot_table(data, index=index, columns=columns, values=values, aggfunc=aggfunc).reset_index()
+
+    # ----------------- Sorting / Sampling -----------------
+    def sort_data(self, data: pd.DataFrame, columns: Union[str, List[str]], ascending: Union[bool, List[bool]] = True) -> pd.DataFrame:
+        return data.sort_values(by=columns, ascending=ascending)
+
+    def sample_data(self, data: pd.DataFrame, n: Optional[int] = None, frac: Optional[float] = None, random_state: int = 42) -> pd.DataFrame:
+        return data.sample(n=n, frac=frac, random_state=random_state)
+
+    # ----------------- Statistics -----------------
+    def get_statistics(self, data: pd.DataFrame) -> Dict[str, Any]:
+        stats = {}
+        numeric_cols = data.select_dtypes(include=['number']).columns
+        for col in numeric_cols:
+            stats[col] = {
+                "count": int(data[col].count()),
+                "mean": float(data[col].mean()),
+                "median": float(data[col].median()),
+                "std": float(data[col].std()),
+                "min": float(data[col].min()),
+                "max": float(data[col].max()),
+                "quartiles": data[col].quantile([0.25, 0.5, 0.75]).to_dict()
+            }
+        return stats
+
+    # ----------------- Type conversions -----------------
+    def convert_data_types(self, data: pd.DataFrame, conversions: Dict[str, str]) -> pd.DataFrame:
+        df = data.copy()
+        for col, dtype in conversions.items():
+            try:
+                if dtype == 'datetime':
+                    df[col] = pd.to_datetime(df[col])
+                else:
+                    df[col] = df[col].astype(dtype)
+                if self.verbose:
+                    print(f"Converted {col} to {dtype}")
+            except Exception as e:
+                print(f"Error converting {col} to {dtype}: {e}")
+        return df
+
+    # ----------------- Sample data -----------------
+    def create_sample_data(self) -> pd.DataFrame:
+        df = pd.DataFrame({
+            'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
+            'age': [25, 30, 35, 28, 32],
+            'city': ['New York', 'London', 'Tokyo', 'Paris', 'Sydney'],
+            'salary': [50000, 60000, 75000, 55000, 65000],
+            'department': ['IT', 'HR', 'IT', 'Finance', 'IT']
+        })
+        if self.verbose:
+            print("Sample data created")
+        return df
+
+
+# ----------------- CLI -----------------
+if __name__ == "__main__":
+    processor = DataProcessor(verbose=True)
+
+    if len(sys.argv) < 2:
+        print("Usage: python data_processor.py <command> [args] [--preview n]")
+        sys.exit(1)
+
+    cmd = sys.argv[1]
+    preview_rows = None
+
+    # Check for preview flag
+    if "--preview" in sys.argv:
+        idx = sys.argv.index("--preview")
+        if len(sys.argv) > idx + 1:
+            preview_rows = int(sys.argv[idx + 1])
+
+    def maybe_preview(df: pd.DataFrame):
+        if preview_rows:
+            print(f"\nPreviewing first {preview_rows} rows:")
+            print(processor.preview_data(df, preview_rows))
+
+    if cmd == "info":
+        file_path = sys.argv[2]
+        df = processor.read_data(file_path)
+        maybe_preview(df)
+        info = processor.get_data_info(df)
+        print(f"\nDataset Information for {file_path}")
+        print("=" * 40)
+        print(f"Rows: {info['rows']}, Columns: {info['columns']}")
+        print(f"Memory usage: {info['memory_usage']:,} bytes")
+        print(f"Duplicates: {info['duplicates']}")
+        print("\nNull counts per column:")
+        for k, v in info['null_counts'].items():
+            print(f"  {k}: {v}")
+
+    elif cmd == "clean":
+        input_file = sys.argv[2]
+        output_file = sys.argv[3]
+        df = processor.read_data(input_file)
+        maybe_preview(df)
+        cleaned = processor.clean_data(df)
+        processor.write_data(cleaned, output_file)
+
+    elif cmd == "convert":
+        input_file = sys.argv[2]
+        output_file = sys.argv[3]
+        fmt = sys.argv[4]
+        df = processor.read_data(input_file)
+        maybe_preview(df)
+        processor.write_data(df, output_file, fmt)
+
+    elif cmd == "stats":
+        file_path = sys.argv[2]
+        df = processor.read_data(file_path)
+        maybe_preview(df)
+        stats = processor.get_statistics(df)
+        print(stats)
+
+    elif cmd == "sample":
+        file_path = sys.argv[2]
+        n = int(sys.argv[3])
+        df = processor.read_data(file_path)
+        maybe_preview(df)
+        sample = processor.sample_data(df, n=n)
+        print(sample)
+
+    elif cmd == "create_sample":
+        df = processor.create_sample_data()
+        maybe_preview(df)
+
+    else:
+        print(f"Unknown command: {cmd}")
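
A minimal usage sketch (not part of the diff) showing how the DataProcessor API added above fits together. It assumes data_processor.py is importable from the working directory; "output.csv" is a placeholder path.

# usage_sketch.py -- exercises DataProcessor with its built-in sample data
from data_processor import DataProcessor

processor = DataProcessor(verbose=True)

# Build the bundled five-row sample dataset instead of reading a real file
df = processor.create_sample_data()

# Clean with the default operations: drop duplicates, fill nulls, strip strings
cleaned = processor.clean_data(df)

# Filter: IT employees earning more than 55,000
it_high = processor.filter_data(cleaned, [
    {"column": "department", "operator": "equals", "value": "IT"},
    {"column": "salary", "operator": "greater_than", "value": 55000},
])

# Aggregate: mean salary per department
summary = processor.aggregate_data(cleaned, "department", {"salary": "mean"})
print(summary)

# Round-trip through CSV; the format is inferred from the extension
processor.write_data(cleaned, "output.csv")

The same operations are reachable from the CLI, e.g. `python data_processor.py create_sample --preview 5` or `python data_processor.py clean in.csv out.csv`.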