From 67982ef7255dd8d415f3aa8de50d31b054914bf4 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Wed, 31 Aug 2022 20:16:49 +0530
Subject: [PATCH 1/4] eda changes

---
 vevestaX/__init__.py |   2 +-
 vevestaX/vevesta.py  | 156 ++++++++++++++++++++++---------------------
 2 files changed, 80 insertions(+), 78 deletions(-)

diff --git a/vevestaX/__init__.py b/vevestaX/__init__.py
index ff52e2e..c5a38a4 100644
--- a/vevestaX/__init__.py
+++ b/vevestaX/__init__.py
@@ -1 +1 @@
-__version__ = '6.8.0'
+__version__ = '6.8.1'
diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index 01eab14..a8ca0f1 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -36,7 +36,7 @@ def test():
 
 class Experiment(object):
 
-    def __init__(self, speedUp=True):
+    def __init__(self, speedUp=False):
         self.__dataSourcing = None
         self.__featureEngineering = None
         self.__data = None
@@ -48,7 +48,7 @@ def __init__(self, speedUp=True):
         self.__filename = self.get_filename()
         self.__sampleSize = 0
         self.__Y=None
-        self.speedUp = speedUp
+        # self.speedUp = speedUp
 
     def get_filename(self):
         try:
@@ -179,26 +179,26 @@ def dataSourcing(self, value):
             self.__dataSourcing = value.columns.tolist()
             self.__data = value
             self.__sampleSize = len(value)
-            if self.speedUp == False:
-                self.__correlation = value.corr(method='pearson')
+            # if self.speedUp == False:
+            self.__correlation = value.corr(method='pearson')
 
         if isinstance(value, pysparkDataframe):
             self.__dataSourcing = value.columns
             self.__data = value
             self.__sampleSize = value.count()
-            if self.speedUp == False:
-                spark = SparkSession.builder.appName("vevesta").getOrCreate()
-                columnNames = []
-                columnNames = value.columns
-                for i in range(len(value.columns)):
-                    value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
-                vectorCol = "corrfeatures"
-                assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
-                df_vector = assembler.transform(value).select(vectorCol)
-                matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
-                corrMatrix = matrix.toArray().tolist()
-                dfCorr = spark.createDataFrame(corrMatrix, columnNames)
-                self.__correlation = dfCorr
+            # if self.speedUp == False:
+            spark = SparkSession.builder.appName("vevesta").getOrCreate()
+            columnNames = []
+            columnNames = value.columns
+            for i in range(len(value.columns)):
+                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
+            vectorCol = "corrfeatures"
+            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
+            df_vector = assembler.transform(value).select(vectorCol)
+            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
+            corrMatrix = matrix.toArray().tolist()
+            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
+            self.__correlation = dfCorr
 
     @property
     def ds(self):  # its doing the same work as dataSourcing do
@@ -238,23 +238,23 @@ def featureEngineering(self, value):
             self.__featureEngineering = cols
 
         if isinstance(value, pandasDataframe):
-            if self.speedUp == False:
-                self.__correlation = value.corr(method='pearson')
+            # if self.speedUp == False:
+            self.__correlation = value.corr(method='pearson')
 
         if isinstance(value, pysparkDataframe):
-            if self.speedUp == False:
-                spark = SparkSession.builder.appName("vevesta").getOrCreate()
-                columnNames = []
-                columnNames = value.columns
-                for i in range(len(value.columns)):
-                    value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
-                vectorCol = "corrfeatures"
-                assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
-                df_vector = assembler.transform(value).select(vectorCol)
-                matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
-                corrMatrix = matrix.toArray().tolist()
-                dfCorr = spark.createDataFrame(corrMatrix, columnNames)
-                self.__correlation = dfCorr
+            # if self.speedUp == False:
+            spark = SparkSession.builder.appName("vevesta").getOrCreate()
+            columnNames = []
+            columnNames = value.columns
+            for i in range(len(value.columns)):
+                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
+            vectorCol = "corrfeatures"
+            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
+            df_vector = assembler.transform(value).select(vectorCol)
+            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
+            corrMatrix = matrix.toArray().tolist()
+            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
+            self.__correlation = dfCorr
 
     @property
     def fe(self):
@@ -421,8 +421,8 @@ def __textColor(self, val):
 
     def __profilingReport(self, fileName):
         sheetName = 'Profiling Report'
-        if self.speedUp:
-            return
+        # if self.speedUp:
+        #     return
         if not isinstance(self.__data, pandasDataframe):
             return
         if self.__data.empty or len(self.__data) == 0:
@@ -528,10 +528,10 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
         if (filename == None):
             filename = "vevesta.xlsx"
-            pdfFilename = "vevesta.pdf"
-        else:
-            pdfFilename=filename.split('.')
-            pdfFilename=pdfFilename[0]+'.pdf'
+            # pdfFilename = "vevesta.pdf"
+        # else:
+        #     pdfFilename=filename.split('.')
+        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # updating variables
         # when no V.start & v.end are not called, all variables in the code get tracked or in colab/kaggle where all variables will get tracked
@@ -634,26 +634,24 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
             if isinstance(self.__data, pysparkDataframe):
                 sampledData.toPandas().to_excel(writer, sheet_name='sampledata', index=False)
 
-            if self.speedUp == False:
-                if self.__correlation is not None:
-                    if isinstance(sampledData, pandasDataframe):
-                        pandasDataframe(self.__correlation).style. \
-                            applymap(self.__colorCellExcel). \
-                            applymap(self.__textColor). \
-                            to_excel(writer, sheet_name='EDA-correlation', index=True)
-
-                    if isinstance(sampledData, pysparkDataframe):
-                        correlation = self.__correlation.toPandas()
-                        correlation.set_index(correlation.columns, inplace=True)
-                        pandasDataframe(correlation).style. \
-                            applymap(self.__colorCellExcel). \
-                            applymap(self.__textColor). \
-                            to_excel(writer, sheet_name='EDA-correlation', index=True)
+            # if self.speedUp == False:
+            if self.__correlation is not None:
+                if isinstance(sampledData, pandasDataframe):
+                    pandasDataframe(self.__correlation).style. \
+                        applymap(self.__colorCellExcel). \
+                        applymap(self.__textColor). \
+                        to_excel(writer, sheet_name='EDA-correlation', index=True)
+
+                if isinstance(sampledData, pysparkDataframe):
+                    correlation = self.__correlation.toPandas()
+                    correlation.set_index(correlation.columns, inplace=True)
+                    pandasDataframe(correlation).style. \
+                        applymap(self.__colorCellExcel). \
+                        applymap(self.__textColor). \
+                        to_excel(writer, sheet_name='EDA-correlation', index=True)
+
 
         self.__profilingReport(filename)
 
-        if self.speedUp == False:
-            self.__EDA(filename)
         self.__plot(filename)
@@ -704,25 +702,27 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
         except Exception as e:
             print('File not pushed to git')
 
-    def __EDA(self, fileName):
-        if isinstance(self.__data, pandasDataframe):
-            self.__EDAForPandas(fileName)
+    def EDA(self, df, fileName):
+        if isinstance(df, pandasDataframe):
+            self.__EDAForPandas(df, fileName)
 
-    def __EDAForPandas(self, fileName):
+    def __EDAForPandas(self, df, xlsxFilename):
 
-        if self.__data.empty or len(self.__data) == 0:
+        if df.empty or len(df) == 0:
             return
 
-        if not isinstance(self.__data, pandasDataframe):
+        if not isinstance(df, pandasDataframe):
             return
 
-        if (fileName == None):
-            return print("Error: Provide the Excel File to plot the models")
+        # if (xlsxFileName == None):
+        #     return print("Error: Provide the Excel File to plot the models")
 
-        if fileName is None:
-            pdfFilename="vevesta.pdf"
+        if xlsxFilename is None:
+            xlsxFilename="EDA_Vevesta.xlsx"
+            pdfFilename="EDA_Vevesta.pdf"
         else:
-            pdfFilename=fileName.split('.')
+            # xlsxFilename = filename
+            pdfFilename=filename.split('.')
             pdfFilename=pdfFilename[0]+'.pdf'
 
         columnTextImgone = 'B2'
@@ -852,10 +852,14 @@ def __EDAForPandas(self, fileName):
 
         with open(pdfFilename,"wb") as f:
             f.write(convert(file,layout_fun=layout_function))
-
+        self.__writeEDADetailsToExcel(xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction)
+
 
-        if (isfile(fileName)):
-            workBook = load_workbook(fileName)
+    def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction):
+        # if ((xlsxFileName)):
+
+        writer = ExcelWriter(xlsxFilename, engine='openpyxl')
+        workBook = writer.book
         workBook.create_sheet('EDA-missingValues')
         plotSheet = workBook['EDA-missingValues']
         img = Image(join(directoryToDumpData, ValueImageFile))
@@ -927,9 +931,7 @@ def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImg
             pdfImage.anchor = columnTextImgone
             pdfPlotsheet.add_image(pdfImage)
 
-
-        workBook.save(fileName)
-        workBook.close()
+        writer.save()
 
     def __plot(self, fileName):
 
@@ -1021,11 +1023,12 @@ def __getExcelSheetData(self, fileName, sheetName):
         modelingData = read_excel(fileName, sheet_name=sheetName, index_col=[])
         return modelingData
 
+
     def commit(self, techniqueUsed, filename=None, message=None, version=None, projectId=None, repoName=None, branch=None):
         self.dump(techniqueUsed, filename=filename, message=message, version=version, showMessage=False, repoName=None)
 
         if filename is None:
-            pdfFilename="vevesta.pdf"
+            pdfFilename="EDA_Vevesta.pdf"
         else:
             pdfFilename=filename.split('.')
             pdfFilename=pdfFilename[0]+'.pdf'
@@ -1054,7 +1057,7 @@ def commit(self, techniqueUsed, filename=None, message=None, version=None, proje
         filename = self.get_filename()
 
         file_exists = exists(filename)
-        file1_exists=exists(pdfFilename)
+        eda_file_exists=exists(pdfFilename)
         if file_exists:
             files = {'file': open(filename, 'rb')}
             headers_for_file = {'Authorization': 'Bearer ' + token}
@@ -1063,7 +1066,7 @@ def commit(self, techniqueUsed, filename=None, message=None, version=None, proje
                                      files=files)
             attachments = list()
             attachments.append(response.json())
-            if file1_exists:
+            if eda_file_exists:
                 files = {'file': open(pdfFilename, 'rb')}
                 response = requests.post(url=backend_url + '/Attachments', headers=headers_for_file, params=params,
                                          files=files)
@@ -1082,8 +1085,7 @@ def commit(self, techniqueUsed, filename=None, message=None, version=None, proje
             "message": message,
             "modeling": self.__variables,
             "dataSourced": self.__dataSourcing,
-            "featureEngineered": self.__featureEngineering,
-            "speedUp": self.speedUp
+            "featureEngineered": self.__featureEngineering
         }
 
         if file_exists:

From 38b774c401234e22512e764086a711c70c13f875 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Thu, 1 Sep 2022 10:52:02 +0530
Subject: [PATCH 2/4] eda changes

---
 vevestaX/vevesta.py | 312 ++++++++++++++++++++++----------------------
 1 file changed, 155 insertions(+), 157 deletions(-)

diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index a8ca0f1..a14ccf9 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -36,19 +36,15 @@ def test():
 
 class Experiment(object):
 
-    def __init__(self, speedUp=False):
+    def __init__(self):
         self.__dataSourcing = None
         self.__featureEngineering = None
-        self.__data = None
-        self.__correlation = None
-
+
         self.__primitiveDataTypes = [int, str, float, bool]
         self.__startlocals = None
         self.__variables = {}
         self.__filename = self.get_filename()
         self.__sampleSize = 0
-        self.__Y=None
-        # self.speedUp = speedUp
 
     def get_filename(self):
         try:
@@ -177,28 +173,10 @@ def dataSourcing(self, value):
     def dataSourcing(self, value):
         if isinstance(value, pandasDataframe):
             self.__dataSourcing = value.columns.tolist()
-            self.__data = value
-            self.__sampleSize = len(value)
-            # if self.speedUp == False:
-            self.__correlation = value.corr(method='pearson')
 
         if isinstance(value, pysparkDataframe):
             self.__dataSourcing = value.columns
-            self.__data = value
-            self.__sampleSize = value.count()
-            # if self.speedUp == False:
-            spark = SparkSession.builder.appName("vevesta").getOrCreate()
-            columnNames = []
-            columnNames = value.columns
-            for i in range(len(value.columns)):
-                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
-            vectorCol = "corrfeatures"
-            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
-            df_vector = assembler.transform(value).select(vectorCol)
-            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
-            corrMatrix = matrix.toArray().tolist()
-            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
-            self.__correlation = dfCorr
+
 
     @property
     def ds(self):  # its doing the same work as dataSourcing do
@@ -282,24 +260,24 @@ def endModelling(self):
     # create alias of method modellingStart and modellingEnd
     start = startModelling
     end = endModelling
-    @property
-    def Y(self):
-        return self.__Y
+    # @property
+    # def Y(self):
+    #     return self.__Y
 
-    @Y.setter
-    def Y(self,value):
-        if isinstance(value,Series):
-            if value.size==self.__sampleSize:
-                self.__Y=value
-                self.____YcolumnName=None
-            else:
-                print('Panda series size not matching with the dataframe size')
-        elif isinstance(value,str):
-            if value in self.__data.columns:
-                self.__Y=self.__data[value]
-                self.____YcolumnName=value
-            else:
-                print("Column not found")
+    # @Y.setter
+    # def Y(self,value):
+    #     if isinstance(value,Series):
+    #         if value.size==self.__sampleSize:
+    #             self.__Y=value
+    #             self.____YcolumnName=None
+    #         else:
+    #             print('Panda series size not matching with the dataframe size')
+    #     elif isinstance(value,str):
+    #         if value in self.__data.columns:
+    #             self.__Y=self.__data[value]
+    #             self.____YcolumnName=value
+    #         else:
+    #             print("Column not found")
 
     # function to get arguments of a function
     def param(self, **decoratorparam):
@@ -419,27 +397,25 @@ def __textColor(self, val):
         color = 'white'
         return 'color: %s' % color
 
-    def __profilingReport(self, fileName):
+    def __profilingReport(self, df, writer):
         sheetName = 'Profiling Report'
         # if self.speedUp:
         #     return
-        if not isinstance(self.__data, pandasDataframe):
+        if not isinstance(df, pandasDataframe):
             return
-        if self.__data.empty or len(self.__data) == 0:
+        if df.empty or len(df) == 0:
             return
-        if fileName is None:
-            return print("Error: Provide the Excel File")
 
         data = [
-            {'Number_of_observation': self.__data.shape[0],
-             'Number_of_variables': self.__data.shape[1],
-             'Missing_cells': self.__data.isna().sum().sum(),
-             'Missing_cells(%)': (self.__data.isnull().sum().sum() * 100) / (
-                     self.__data.notnull().sum().sum() + self.__data.isnull().sum().sum()),
-             'Duplicate_rows': self.__data.duplicated().sum(),
-             'Duplicate_rows(%)': (self.__data.duplicated().sum() * 100) / len(self.__data),
-             'Total_size_in_memory(byte)': self.__data.memory_usage().sum(),
-             'Average_record_size_in_memory(byte)': self.__data.memory_usage().sum() / len(self.__data)
+            {'Number_of_observation': df.shape[0],
+             'Number_of_variables': df.shape[1],
+             'Missing_cells': df.isna().sum().sum(),
+             'Missing_cells(%)': (df.isnull().sum().sum() * 100) / (
+                     df.notnull().sum().sum() + df.isnull().sum().sum()),
+             'Duplicate_rows': df.duplicated().sum(),
+             'Duplicate_rows(%)': (df.duplicated().sum() * 100) / len(df),
+             'Total_size_in_memory(byte)': df.memory_usage().sum(),
+             'Average_record_size_in_memory(byte)': df.memory_usage().sum() / len(df)
             }
         ]
         profilingDataframe = pandasDataframe(data)
@@ -450,71 +426,70 @@ def __profilingReport(self, fileName):
                            "Skewness", "Median", "Mode", "Outliers", "Outliers (%)", "Q1 quantile", "Q2 quantile",
                            "Q3 quantile", "100th quantile", "Total Memory Size(bytes)"]})
 
-        numericColumns = self.__data.select_dtypes(include=["number"]).columns
+        numericColumns = df.select_dtypes(include=["number"]).columns
         for col in numericColumns:
             # finding outliers for each column
-            Q1 = quantile(self.__data[col], 0.25)
-            Q3 = quantile(self.__data[col], 0.75)
+            Q1 = quantile(df[col], 0.25)
+            Q3 = quantile(df[col], 0.75)
             IQR = Q3 - Q1
-            outlier = ((self.__data[col] < (Q1 - 1.5 * IQR)) | (self.__data[col] > (Q3 + 1.5 * IQR))).sum()
-            col_dict = {"Distinct": self.__data[col].nunique(),
-                        "Distinct (%)": self.__data[col].nunique() * 100 / self.__data.shape[0],
-                        "Missing": self.__data[col].isna().sum(),
-                        "Missing (%)": (self.__data[col].isnull().sum() * 100) / (self.__data.shape[0]),
-                        "Infinite": isinf(self.__data[col]).values.sum(),
-                        "Infinite (%)": isinf(self.__data[col]).values.sum() * 100 / (self.__data.shape[0]),
-                        "Mean": self.__data[col].mean(),
-                        "Minimum": self.__data[col].min(),
-                        "Maximum": self.__data[col].max(),
-                        "Zeros": (self.__data[col] == 0).sum(),
-                        "Zeros (%)": (self.__data[col] == 0).sum() * 100 / self.__data.shape[0],
-                        "Negative": (self.__data[col] < 0).sum(),
-                        "Negative (%)": (self.__data[col] < 0).sum() * 100 / self.__data.shape[0],
-                        "Kurtosis": kurtosis(self.__data[col], axis=0, bias=True),
-                        "Skewness": skew(self.__data[col], axis=0, bias=True),
-                        "Median": median(self.__data[col]),
-                        "Mode": mode(self.__data[col]),
+            outlier = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
+            col_dict = {"Distinct": df[col].nunique(),
+                        "Distinct (%)": df[col].nunique() * 100 / df.shape[0],
+                        "Missing": df[col].isna().sum(),
+                        "Missing (%)": (df[col].isnull().sum() * 100) / (df.shape[0]),
+                        "Infinite": isinf(df[col]).values.sum(),
+                        "Infinite (%)": isinf(df[col]).values.sum() * 100 / (df.shape[0]),
+                        "Mean": df[col].mean(),
+                        "Minimum": df[col].min(),
+                        "Maximum": df[col].max(),
+                        "Zeros": (df[col] == 0).sum(),
+                        "Zeros (%)": (df[col] == 0).sum() * 100 / df.shape[0],
+                        "Negative": (df[col] < 0).sum(),
+                        "Negative (%)": (df[col] < 0).sum() * 100 / df.shape[0],
+                        "Kurtosis": kurtosis(df[col], axis=0, bias=True),
+                        "Skewness": skew(df[col], axis=0, bias=True),
+                        "Median": median(df[col]),
+                        "Mode": mode(df[col]),
                         "Outliers": outlier,
-                        "Outliers (%)": outlier * 100 / self.__data.shape[0],
-                        "Q1 quantile": quantile(self.__data[col], 0.25),
-                        "Q2 quantile": quantile(self.__data[col], 0.5),
-                        "Q3 quantile": quantile(self.__data[col], 0.75),
-                        "100th quantile": quantile(self.__data[col], 1),
-                        "Total Memory Size(bytes)": self.__data[col].memory_usage()}
+                        "Outliers (%)": outlier * 100 / df.shape[0],
+                        "Q1 quantile": quantile(df[col], 0.25),
+                        "Q2 quantile": quantile(df[col], 0.5),
+                        "Q3 quantile": quantile(df[col], 0.75),
+                        "100th quantile": quantile(df[col], 1),
+                        "Total Memory Size(bytes)": df[col].memory_usage()}
             profileOfVariableDataframe[col] = col_dict.values()
 
-        nonNumericalColumns = self.__data.select_dtypes(exclude=["number", "datetime"]).columns
+        nonNumericalColumns = df.select_dtypes(exclude=["number", "datetime"]).columns
         for col in nonNumericalColumns:
-            col_dict = {"Distinct": self.__data[col].nunique(),
-                        "Distinct (%)": self.__data[col].nunique() * 100 / self.__data.shape[0],
-                        "Missing": self.__data[col].isna().sum(),
-                        "Missing (%)": (self.__data[col].isnull().sum() * 100) / (self.__data.shape[0]),
+            col_dict = {"Distinct": df[col].nunique(),
+                        "Distinct (%)": df[col].nunique() * 100 / df.shape[0],
+                        "Missing": df[col].isna().sum(),
+                        "Missing (%)": (df[col].isnull().sum() * 100) / (df.shape[0]),
                         "Infinite": "NA",
                         "Infinite (%)": "NA",
                         "Mean": "NA",
                         "Minimum": "NA",
                         "Maximum": "NA",
                         "Zeros": "NA",
                         "Zeros (%)": "NA",
                         "Negative": "NA",
                         "Negative (%)": "NA",
                         "Kurtosis": "NA",
                         "Skewness": "NA",
                         "Median": "NA",
                         "Mode": "NA",
                         "Outliers": "NA",
                         "Outliers (%)": "NA",
                         "Q1 quantile": "NA",
                         "Q2 quantile": "NA",
                         "Q3 quantile": "NA",
                         "100th quantile": "NA",
-                        "Total Memory Size(bytes)": self.__data[col].memory_usage()}
+                        "Total Memory Size(bytes)": df[col].memory_usage()}
             profileOfVariableDataframe[col] = col_dict.values()
 
-        if isfile(fileName):
-            with ExcelWriter(fileName, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
-                profilingDataframe.to_excel(writer, sheet_name="Profiling Report", index=False)
-                profileOfVariableDataframe.to_excel(writer, sheet_name="Variables Data Profile", index=False)
+
+        profilingDataframe.to_excel(writer, sheet_name="Profiling Report", index=False)
+        profileOfVariableDataframe.to_excel(writer, sheet_name="Variables Data Profile", index=False)
 
     def dump(self, techniqueUsed, filename=None, message=None, version=None, showMessage=True, repoName=None):
@@ -528,10 +503,6 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
         if (filename == None):
             filename = "vevesta.xlsx"
-            # pdfFilename = "vevesta.pdf"
-        # else:
-        #     pdfFilename=filename.split('.')
-        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # updating variables
         # when no V.start & v.end are not called, all variables in the code get tracked or in colab/kaggle where all variables will get tracked
@@ -607,18 +578,6 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
         df_messages = pandasDataframe(index=[1], data=data)
         df_messages = concat([messageData, df_messages], ignore_index=True)
 
-        self.__sampleSize = 100 if self.__sampleSize >= 100 else self.__sampleSize
-
-        if isinstance(self.__data, pandasDataframe):
-            sampledData = self.__data.sample(self.__sampleSize)
-
-        if isinstance(self.__data, pysparkDataframe):
-            if self.__data.count() >= 100:
-                sampledData = self.__data.sample(100 / self.__data.count())
-
-            if self.__data.count() < 100:
-                sampledData = self.__data.sample(1.0)
-
         with ExcelWriter(filename, engine='openpyxl') as writer:
 
             df_dataSourcing.to_excel(writer, sheet_name='dataSourcing', index=False)
@@ -629,30 +588,5 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
             df_messages.to_excel(writer, sheet_name='messages', index=False)
 
-            if isinstance(self.__data, pandasDataframe):
-                pandasDataframe(sampledData).to_excel(writer, sheet_name='sampledata', index=False)
-
-            if isinstance(self.__data, pysparkDataframe):
-                sampledData.toPandas().to_excel(writer, sheet_name='sampledata', index=False)
-
-            # if self.speedUp == False:
-            if self.__correlation is not None:
-                if isinstance(sampledData, pandasDataframe):
-                    pandasDataframe(self.__correlation).style. \
-                        applymap(self.__colorCellExcel). \
-                        applymap(self.__textColor). \
-                        to_excel(writer, sheet_name='EDA-correlation', index=True)
-
-                if isinstance(sampledData, pysparkDataframe):
-                    correlation = self.__correlation.toPandas()
-                    correlation.set_index(correlation.columns, inplace=True)
-                    pandasDataframe(correlation).style. \
-                        applymap(self.__colorCellExcel). \
-                        applymap(self.__textColor). \
-                        to_excel(writer, sheet_name='EDA-correlation', index=True)
-
-        self.__profilingReport(filename)
-
-        self.__plot(filename)
 
         print("Dumped the experiment in the file " + filename)
@@ -704,11 +638,72 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
         except Exception as e:
             print('File not pushed to git')
 
-    def EDA(self, df, fileName):
+    def __writeEDACorrelationData(self, value, sampledData, writer):
+        if isinstance(value, pandasDataframe):
+            correlation = value.corr(method='pearson')
+        if isinstance(value, pysparkDataframe):
+            spark = SparkSession.builder.appName("vevesta").getOrCreate()
+            columnNames = []
+            columnNames = value.columns
+            for i in range(len(value.columns)):
+                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
+            vectorCol = "corrfeatures"
+            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
+            df_vector = assembler.transform(value).select(vectorCol)
+            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
+            corrMatrix = matrix.toArray().tolist()
+            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
+            correlation = dfCorr
+
+        if correlation is not None:
+            if isinstance(sampledData, pandasDataframe):
+                pandasDataframe(correlation).style. \
+                    applymap(self.__colorCellExcel). \
+                    applymap(self.__textColor). \
+                    to_excel(writer, sheet_name='EDA-correlation', index=True)
+
+            if isinstance(sampledData, pysparkDataframe):
+                correlationPandas = correlation.toPandas()
+                correlationPandas.set_index(correlationPandas.columns, inplace=True)
+                pandasDataframe(correlationPandas).style. \
+                    applymap(self.__colorCellExcel). \
+                    applymap(self.__textColor). \
+                    to_excel(writer, sheet_name='EDA-correlation', index=True)
+
+
+    def __writeSampledData(self, df, writer):
+
+        if isinstance(df, pandasDataframe):
+            sampleSize = df.shape[0]
+
+        if isinstance(df, pysparkDataframe):
+            sampleSize = df.count()
+
+        sampleSize = 100 if sampleSize >= 100 else sampleSize
+
         if isinstance(df, pandasDataframe):
-            self.__EDAForPandas(df, fileName)
+            sampledData = df.sample(sampleSize)
+
+        if isinstance(df, pysparkDataframe):
+            if df.count() >= 100:
+                sampledData = df.sample(100 / df.count())
+
+            if df.count() < 100:
+                sampledData = df.sample(1.0)
+
+        if isinstance(df, pandasDataframe):
+            pandasDataframe(sampledData).to_excel(writer, sheet_name='sampledata', index=False)
+
+        if isinstance(df, pysparkDataframe):
+            sampledData.toPandas().to_excel(writer, sheet_name='sampledata', index=False)
+
+        return sampledData
+
+    def EDA(self, data, Y, fileName):
+        if isinstance(data, pandasDataframe):
+            self.__EDAForPandas(data, Y, fileName)
 
-    def __EDAForPandas(self, df, xlsxFilename):
+    def __EDAForPandas(self, df, Y, xlsxFilename):
 
         if df.empty or len(df) == 0:
             return
@@ -742,15 +737,15 @@ def __EDAForPandas(self, df, xlsxFilename):
 
         # EDA missing values
         plt.figure(figsize=(6, 6))
-        plt.imshow(self.__data.isna(), aspect="auto", interpolation="nearest", cmap="coolwarm", extent=[0, 7, 0, 7])
+        plt.imshow(df.isna(), aspect="auto", interpolation="nearest", cmap="coolwarm", extent=[0, 7, 0, 7])
         plt.title("Sample Number vs Column Number")
         plt.xlabel("Column Number")
         plt.ylabel("Sample Number")
         plt.savefig(join(directoryToDumpData, ValueImageFile), bbox_inches='tight', dpi=100)
         plt.close()
 
         # eda numeric feature distribution
-        RatioData = self.__data.isna().mean().sort_values()
+        RatioData = df.isna().mean().sort_values()
         xAxis = list(RatioData.index)
         yAxis = list(RatioData)
         plt.figure(figsize=(6, 6))
@@ -760,23 +755,23 @@ def __EDAForPandas(self, df, xlsxFilename):
         plt.savefig(join(directoryToDumpData, ValueRatioImageFile), bbox_inches='tight', dpi=100)
         plt.close()
 
         # eda non numeric feature distribution
-        self.__data.plot(lw=0, marker="x", subplots=True, layout=(-1, 4), figsize=(10, 10), markersize=5,
+        df.plot(lw=0, marker="x", subplots=True, layout=(-1, 4), figsize=(10, 10), markersize=5,
                 title="Numeric feature Distribution(with X-axis representing the position in the file)").flatten()
         plt.tight_layout()
         plt.savefig(join(directoryToDumpData, NumericalFeatureDistributionImageFile), bbox_inches='tight',
                     dpi=100)
         plt.close()
 
         # EDA for outliers
-        numericColumns = self.__data.select_dtypes(include=["number"])
+        numericColumns = df.select_dtypes(include=["number"])
         red_circle = dict(markerfacecolor='red', marker='o', markeredgecolor='white')
         fig, axs = plt.subplots(2, len(numericColumns.columns)//2, figsize=(10, 10))
         fig.suptitle('Outliers',fontsize=20)
         for i, ax in enumerate(axs.flat):
             ax.boxplot(numericColumns.iloc[:, i], flierprops=red_circle)
-            ax.set_title(self.__data.columns[i], fontsize=15)
+            ax.set_title(df.columns[i], fontsize=15)
             #ax.tick_params(axis='both', labelrotation=45)
         plt.subplots_adjust(wspace=2)
         plt.savefig(join(directoryToDumpData, OutliersImageFile), bbox_inches='tight', dpi=100)
         plt.close()
@@ -789,7 +784,7 @@ def __EDAForPandas(self, df, xlsxFilename):
         plt.close()"""
 
         # Identify non-numerical features
-        nonNumericalColumns = self.__data.select_dtypes(exclude=["number", "datetime"])
+        nonNumericalColumns = df.select_dtypes(exclude=["number", "datetime"])
         if len(nonNumericalColumns.columns) != 0:
             fig = plt.figure(figsize=(7, 7))
             k = 1
@@ -821,25 +816,25 @@ def __EDAForPandas(self, df, xlsxFilename):
 
         # feature distribution
-        fig = self.__data.hist(bins=len(self.__data), figsize=(30, 25), layout=(-1, 3), edgecolor="black",
+        fig = df.hist(bins=len(df), figsize=(30, 25), layout=(-1, 3), edgecolor="black",
                       xlabelsize=15, ylabelsize=15)
         [x.title.set_size(15) for x in fig.ravel()]
         [x.tick_params(axis='x', labelrotation=90) for x in fig.ravel()]
         plt.savefig(join(directoryToDumpData, FeatureHistogramImageFile), bbox_inches='tight', dpi=100)
         plt.close()
 
         #Probability Density Function
-        numericDataframe = self.__data.select_dtypes(include='number')
-        if self.__Y is not None and (self.__Y.dtype=='int64' or self.__Y.dtype=='object') and self.__Y.dtype!='float64':
-            k=1
-            fig = plt.figure(figsize=(20,15))
-            for i in numericDataframe:
-                if i!=self.____YcolumnName:
+        numericDataframe = df.select_dtypes(include='number')
+        if Y is not None and (Y.dtype=='int64' or Y.dtype=='object') and Y.dtype!='float64':
+            if len(Y) == numericDataframe.shape[0]:
+                k=1
+                fig = plt.figure(figsize=(20,15))
+                for i in numericDataframe:
                     ax = fig.add_subplot(4,(len(numericDataframe.columns)//4)+1, k)
-                    frequency=self.__Y.value_counts().keys().tolist()[0:10]
-                    y=self.__Y[self.__Y.isin(frequency)]
+                    frequency=Y.value_counts().keys().tolist()[0:10]
+                    y=Y[self.__Y.isin(frequency)]
                     sns.kdeplot(x=numericDataframe[i],hue=y, ax = ax,fill=True)
                     k+=1
-            plt.savefig(join(directoryToDumpData, ProbabilityDensityFunction), bbox_inches='tight', dpi=100)
-            plt.close()
+                plt.savefig(join(directoryToDumpData, ProbabilityDensityFunction), bbox_inches='tight', dpi=100)
+                plt.close()
 
 
@@ -852,11 +847,11 @@ def __EDAForPandas(self, df, xlsxFilename):
 
         with open(pdfFilename,"wb") as f:
             f.write(convert(file,layout_fun=layout_function))
-        self.__writeEDADetailsToExcel(xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction)
+        self.__writeEDADetailsToExcel(df, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction)
 
 
-    def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction):
+    def __writeEDADetailsToExcel(self, df, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction):
         # if ((xlsxFileName)):
 
         writer = ExcelWriter(xlsxFilename, engine='openpyxl')
         workBook = writer.book
@@ -898,7 +893,7 @@ def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImg
 
         # adding non-numeric column
-        nonNumericalColumns = self.__data.select_dtypes(exclude=["number", "datetime"])
+        nonNumericalColumns = df.select_dtypes(exclude=["number", "datetime"])
         if len(nonNumericalColumns.columns) != 0 and exists(
                 join(directoryToDumpData, NonNumericFeaturesImgFile)):
             workBook.create_sheet('EDA-NonNumericFeatures')
@@ -931,6 +926,9 @@ def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImg
             pdfImage.anchor = columnTextImgone
             pdfPlotsheet.add_image(pdfImage)
 
+        sampledData = self.__writeSampledData(df, writer)
+        self.__writeEDACorrelationData(df, sampledData, writer)
+        self.__profilingReport(df, writer)
         writer.save()
 
     def __plot(self, fileName):

From 7b3ed27a3b44eade3899cc560e90e76ae5a31582 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Fri, 2 Sep 2022 00:47:27 +0530
Subject: [PATCH 3/4] eda changes

---
 vevestaX/vevesta.py | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index a14ccf9..152a67e 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -260,24 +260,24 @@ def endModelling(self):
     # create alias of method modellingStart and modellingEnd
     start = startModelling
     end = endModelling
-    # @property
-    # def Y(self):
-    #     return self.__Y
+    @property
+    def Y(self):
+        return self.__Y
 
-    # @Y.setter
-    # def Y(self,value):
-    #     if isinstance(value,Series):
-    #         if value.size==self.__sampleSize:
-    #             self.__Y=value
-    #             self.____YcolumnName=None
-    #         else:
-    #             print('Panda series size not matching with the dataframe size')
-    #     elif isinstance(value,str):
-    #         if value in self.__data.columns:
-    #             self.__Y=self.__data[value]
-    #             self.____YcolumnName=value
-    #         else:
-    #             print("Column not found")
+    @Y.setter
+    def Y(self,value):
+        if isinstance(value,Series):
+            if value.size==self.__sampleSize:
+                self.__Y=value
+                self.____YcolumnName=None
+            else:
+                print('Panda series size not matching with the dataframe size')
+        elif isinstance(value,str):
+            if value in self.__data.columns:
+                self.__Y=self.__data[value]
+                self.____YcolumnName=value
+            else:
+                print("Column not found")
 
     # function to get arguments of a function
     def param(self, **decoratorparam):
@@ -503,6 +503,10 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
         if (filename == None):
             filename = "vevesta.xlsx"
+            # pdfFilename = "vevesta.pdf"
+        # else:
+        #     pdfFilename=filename.split('.')
+        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # updating variables
         # when no V.start & v.end are not called, all variables in the code get tracked or in colab/kaggle where all variables will get tracked
@@ -824,16 +828,17 @@ def __EDAForPandas(self, df, Y, xlsxFilename):
 
         #Probability Density Function
         numericDataframe = df.select_dtypes(include='number')
-        if Y is not None and (Y.dtype=='int64' or Y.dtype=='object') and Y.dtype!='float64':
+        if Y is not None and (Y.dtype=='int32' or Y.dtype=='int64' or Y.dtype=='object') and Y.dtype!='float64':
             if len(Y) == numericDataframe.shape[0]:
                 k=1
                 fig = plt.figure(figsize=(20,15))
                 for i in numericDataframe:
                     ax = fig.add_subplot(4,(len(numericDataframe.columns)//4)+1, k)
                     frequency=Y.value_counts().keys().tolist()[0:10]
-                    y=Y[self.__Y.isin(frequency)]
+                    y=Y[Y.isin(frequency)]
                     sns.kdeplot(x=numericDataframe[i],hue=y, ax = ax,fill=True)
                     k+=1
+                plt.suptitle('Probability Density Function',fontsize=20)
                 plt.savefig(join(directoryToDumpData, ProbabilityDensityFunction), bbox_inches='tight', dpi=100)
                 plt.close()
@@ -918,7 +923,7 @@ def __writeEDADetailsToExcel(self, df, xlsxFilename, columnTextImgone, columnTex
 
         if exists(join(directoryToDumpData, ProbabilityDensityFunction)):
-            workBookName = 'EDA-PDF'
+            workBookName = 'EDA-ProbabilityDensityFunction'
             workBook.create_sheet(workBookName)
             pdfPlotsheet = workBook[workBookName]
             pdfImage = Image(

From e74b916ba881d19eaca2e100e68f23a22a0269c1 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Fri, 2 Sep 2022 15:31:45 +0530
Subject: [PATCH 4/4] hardcoding EDA filename

---
 vevestaX/vevesta.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index 152a67e..eb5576d 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -701,9 +701,9 @@ def __writeSampledData(self, df, writer):
 
         return sampledData
 
-    def EDA(self, data, Y, fileName):
+    def EDA(self, data, Y):
         if isinstance(data, pandasDataframe):
-            self.__EDAForPandas(data, Y, fileName)
+            self.__EDAForPandas(data, Y, "EDA_Vevesta.xlsx")
 
     def __EDAForPandas(self, df, Y, xlsxFilename):
 
@@ -721,7 +721,7 @@ def __EDAForPandas(self, df, Y, xlsxFilename):
             pdfFilename="EDA_Vevesta.pdf"
         else:
             # xlsxFilename = filename
-            pdfFilename=filename.split('.')
+            pdfFilename=xlsxFilename.split('.')
             pdfFilename=pdfFilename[0]+'.pdf'
 
         columnTextImgone = 'B2'
@@ -1030,11 +1030,11 @@ def __getExcelSheetData(self, fileName, sheetName):
 
     def commit(self, techniqueUsed, filename=None, message=None, version=None, projectId=None, repoName=None, branch=None):
         self.dump(techniqueUsed, filename=filename, message=message, version=version, showMessage=False, repoName=None)
-        if filename is None:
-            pdfFilename="EDA_Vevesta.pdf"
-        else:
-            pdfFilename=filename.split('.')
-            pdfFilename=pdfFilename[0]+'.pdf'
+        # if filename is None:
+        pdfFilename="EDA_Vevesta.pdf"
+        # else:
+        #     pdfFilename=filename.split('.')
+        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # api-endpoint
         token = self.__find_access_token()
         backend_url = 'https://api.matrixkanban.com/services-1.0-SNAPSHOT'
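
Usage note: after this series, EDA is decoupled from dump()/commit() and always
writes EDA_Vevesta.xlsx / EDA_Vevesta.pdf (PATCH 4/4 hardcodes the names). A
minimal sketch of the resulting flow, assuming the package's usual
"from vevestaX import vevesta" import; the input file and the "target" label
column are illustrative assumptions, not part of the patches:

    import pandas as pd
    from vevestaX import vevesta as v   # assumed import path

    V = v.Experiment()                  # speedUp argument was dropped in PATCH 2/4
    df = pd.read_csv("train.csv")       # illustrative input
    V.ds = df                           # track data-sourcing columns
    V.fe = df                           # track feature-engineering columns

    # Writes EDA_Vevesta.xlsx and EDA_Vevesta.pdf to the working directory.
    # Y drives the class-conditional density plots; per PATCH 3/4 it should be
    # an int32/int64/object series matching the dataframe's row count.
    V.EDA(data=df, Y=df["target"])      # "target" is an assumed label column

    V.dump(techniqueUsed="XGBoost")     # experiment log only; sampledata,
                                        # correlation and profiling sheets now
                                        # land in the EDA workbook instead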