Vaibhav2154 · Vaibhav2154 · Aug 27, 2025 · Aug 26, 2025 · Aug 26, 2025
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -0,0 +1,20 @@
+beautifulsoup4==4.13.4
+docx2pdf==0.1.8
+fastapi==0.116.1
+fpdf==1.7.2
+matplotlib==3.10.5
+numpy==2.3.2
+pandas==2.3.2
+Pillow==11.3.0
+plyer==2.1.0
+pydantic==2.11.7
+pydantic_settings==2.10.1
+PyPDF2==3.0.1
+qrcode==8.2
+Requests==2.32.5
+schedule==1.2.2
+seaborn==0.13.2
+SQLAlchemy==2.0.43
+sqlmodel==0.0.24
+watchdog==6.0.0
+yt_dlp==2025.8.22
diff --git a/backend/scripts/MachineLearning/prediction.py b/backend/scripts/MachineLearning/prediction.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+import re
+import sys
+import os
+import logging
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+import matplotlib.pyplot as plt
+
+from backend.scripts.data_tools.data_converter import DataConverter
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+
+class SalesPredictor:
+    def __init__(self):
+        self.model = None
+        self.df = None
+
+    # ---------------- Extraction ----------------
+    def extract_pdf(self, path):
+        text = pdf_extract_text(path)
+        logging.info(f"Extracted {len(text)} characters from PDF")
+        return text
+
+    def extract_docx(self, path):
+        doc = docx.Document(path)
+        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+        text = "\n".join(paragraphs)
+        logging.info(f"Extracted {len(paragraphs)} paragraphs from DOCX")
+        return text
+
+    def extract_image(self, path):
+        img = Image.open(path)
+        text = pytesseract.image_to_string(img)
+        logging.info(f"Extracted {len(text)} characters from image")
+        return text
+
+    # ---------------- Parsing ----------------
+    DATE_PATTERNS = [
+        r"(?P<date>\d{4}[-/]\d{1,2}[-/]\d{1,2})",  # YYYY-MM-DD
+        r"(?P<date>\d{1,2}[-/]\d{1,2}[-/]\d{4})",  # DD-MM-YYYY
+        r"(?P<date>\d{4}[-/]\d{1,2})",             # YYYY-MM
+    ]
+    NUMBER_PATTERN = r"(?P<num>\d{1,3}(?:[,\s]\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)"
+
+    def try_parse_date(self, s):
+        for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%m-%d-%Y", "%Y-%m", "%Y/%m"):
+            try:
+                return datetime.strptime(s, fmt)
+            except Exception:
+                continue
+        return None
+
+    def parse_sales(self, text):
+        rows = []
+        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+        for ln in lines:
+            date_found, num = None, None
+            for pat in self.DATE_PATTERNS:
+                m = re.search(pat, ln)
+                if m:
+                    dt = self.try_parse_date(m.group("date"))
+                    if dt:
+                        date_found = dt
+                        break
+            nums = re.findall(self.NUMBER_PATTERN, ln)
+            if nums:
+                try:
+                    num = float(nums[0].replace(",", "").replace(" ", ""))
+                except Exception:
+                    num = None
+            if date_found and num is not None:
+                rows.append({"date": pd.to_datetime(date_found), "sales": num})
+
+        if not rows:
+            logging.warning("No sales data found")
+            return pd.DataFrame(columns=["date", "sales"])
+        df = pd.DataFrame(rows).drop_duplicates().sort_values("date")
+        return df
+
+    # ---------------- Model ----------------
+    def train(self, df):
+        df = df.dropna()
+        df["date"] = pd.to_datetime(df["date"], errors="coerce")
+        df = df.dropna(subset=["date", "sales"])  # ensure valid rows only
+
+        # create time index feature
+        df["t"] = (df["date"] - df["date"].min()).dt.days
+
+        X = df[["t"]].values
+        y = df["sales"].values
+
+        # ✅ Use RandomForest instead of Linear Regression
+        model = RandomForestRegressor(
+            n_estimators=200, 
+            random_state=42,
+            max_depth=None,
+            n_jobs=-1
+        )
+        model.fit(X, y)
+
+        self.model = model
+        self.df = df
+        logging.info("RandomForest model trained on %d samples", len(df))
+        return model
+
+
+    def forecast(self, days=30):
+        if self.model is None or self.df is None:
+            raise RuntimeError("Model not trained")
+        last_t = self.df["t"].max()
+        future_dates = [self.df["date"].max() + timedelta(days=i) for i in range(1, days + 1)]
+        future_t = np.array([last_t + i for i in range(1, days + 1)]).reshape(-1, 1)
+        preds = self.model.predict(future_t)
+        return pd.DataFrame({"date": future_dates, "sales": preds})
+
+    def plot_forecast(self, forecast_df, output_path="forecast.png"):
+        plt.figure(figsize=(10, 6))
+        plt.plot(self.df["date"], self.df["sales"], label="Historical", marker="o")
+        plt.plot(forecast_df["date"], forecast_df["sales"], label="Forecast", linestyle="--", marker="x")
+        plt.legend()
+        plt.grid(True)
+        plt.title("Sales Forecast")
+        plt.xlabel("Date")
+        plt.ylabel("Sales")
+        plt.savefig(output_path)
+        plt.close()
+        logging.info(f"Saved forecast plot to {output_path}")
+
+
+# ---------------- CLI ----------------
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("Usage: python prediction.py <command> <file> [options]")
+        print("Commands:")
+        print("  csv <path>")
+        print("  pdf <path>")
+        print("  docx <path>")
+        print("  image <path>")
+        print("  xml <path>")
+        print("  json <path>")
+        print("Example: python prediction.py pdf sales.pdf")
+        sys.exit(1)
+
+    predictor = SalesPredictor()
+    command, path = sys.argv[1], sys.argv[2]
+
+    if command == "csv" or command == "xml" or command == "json":
+        df = DataConverter.auto_read(path)
+    elif command == "pdf":
+        text = predictor.extract_pdf(path)
+        df = predictor.parse_sales(text)
+    elif command == "docx":
+        text = predictor.extract_docx(path)
+        df = predictor.parse_sales(text)
+    elif command == "image":
+        text = predictor.extract_image(path)
+        df = predictor.parse_sales(text)
+    else:
+        print("Unknown command")
+        sys.exit(1)
+
+    if df is not None and not df.empty:
+        predictor.train(df)
+        forecast_df = predictor.forecast(30)
+        predictor.plot_forecast(forecast_df)
+        print("Forecast completed. Saved plot as forecast.png")
+    else:
+        print("No data extracted.")
diff --git a/backend/scripts/requirements.txt b/backend/scripts/requirements.txt
@@ -0,0 +1,20 @@
+beautifulsoup4==4.13.5
+docx2pdf==0.1.8
+fpdf==1.7.2
+matplotlib==3.10.5
+numpy==2.3.2
+pandas==2.3.2
+pdfminer==20191125
+pdfminer_six==20250506
+Pillow==11.3.0
+plyer==2.1.0
+PyPDF2==3.0.1
+pytesseract==0.3.13
+python_docx==1.2.0
+qrcode==8.2
+Requests==2.32.5
+schedule==1.2.2
+scikit_learn==1.7.1
+seaborn==0.13.2
+watchdog==6.0.0
+yt_dlp==2025.8.22