From 67982ef7255dd8d415f3aa8de50d31b054914bf4 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Wed, 31 Aug 2022 20:16:49 +0530
Subject: [PATCH 1/4] eda changes

---
 vevestaX/__init__.py |   2 +-
 vevestaX/vevesta.py  | 156 ++++++++++++++++++++++---------------------
 2 files changed, 80 insertions(+), 78 deletions(-)

diff --git a/vevestaX/__init__.py b/vevestaX/__init__.py
index ff52e2e..c5a38a4 100644
--- a/vevestaX/__init__.py
+++ b/vevestaX/__init__.py
@@ -1 +1 @@
-__version__ = '6.8.0'
+__version__ = '6.8.1'
diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index 01eab14..a8ca0f1 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -36,7 +36,7 @@ def test():
 
 class Experiment(object):
 
-    def __init__(self, speedUp=True):
+    def __init__(self, speedUp=False):
         self.__dataSourcing = None
         self.__featureEngineering = None
         self.__data = None
@@ -48,7 +48,7 @@ def __init__(self, speedUp=True):
         self.__filename = self.get_filename()
         self.__sampleSize = 0
         self.__Y=None
-        self.speedUp = speedUp
+        # self.speedUp = speedUp
 
     def get_filename(self):
         try:
@@ -179,26 +179,26 @@ def dataSourcing(self, value):
             self.__dataSourcing = value.columns.tolist()
             self.__data = value
             self.__sampleSize = len(value)
-            if self.speedUp == False:
-                self.__correlation = value.corr(method='pearson')
+            # if self.speedUp == False:
+            self.__correlation = value.corr(method='pearson')
 
         if isinstance(value, pysparkDataframe):
             self.__dataSourcing = value.columns
             self.__data = value
             self.__sampleSize = value.count()
-            if self.speedUp == False:
-                spark = SparkSession.builder.appName("vevesta").getOrCreate()
-                columnNames = []
-                columnNames = value.columns
-                for i in range(len(value.columns)):
-                    value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
-                vectorCol = "corrfeatures"
-                assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
-                df_vector = assembler.transform(value).select(vectorCol)
-                matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
-                corrMatrix = matrix.toArray().tolist()
-                dfCorr = spark.createDataFrame(corrMatrix, columnNames)
-                self.__correlation = dfCorr
+            # if self.speedUp == False:
+            spark = SparkSession.builder.appName("vevesta").getOrCreate()
+            columnNames = []
+            columnNames = value.columns
+            for i in range(len(value.columns)):
+                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
+            vectorCol = "corrfeatures"
+            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
+            df_vector = assembler.transform(value).select(vectorCol)
+            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
+            corrMatrix = matrix.toArray().tolist()
+            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
+            self.__correlation = dfCorr
 
     @property
     def ds(self):  # its doing the same work as dataSourcing do
@@ -238,23 +238,23 @@ def featureEngineering(self, value):
             self.__featureEngineering = cols
 
         if isinstance(value, pandasDataframe):
-            if self.speedUp == False:
-                self.__correlation = value.corr(method='pearson')
+            # if self.speedUp == False:
+            self.__correlation = value.corr(method='pearson')
 
         if isinstance(value, pysparkDataframe):
-            if self.speedUp == False:
-                spark = SparkSession.builder.appName("vevesta").getOrCreate()
-                columnNames = []
-                columnNames = value.columns
-                for i in range(len(value.columns)):
-                    value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
-                vectorCol = "corrfeatures"
-                assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
-                df_vector = assembler.transform(value).select(vectorCol)
-                matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
-                corrMatrix = matrix.toArray().tolist()
-                dfCorr = spark.createDataFrame(corrMatrix, columnNames)
-                self.__correlation = dfCorr
+            # if self.speedUp == False:
+            spark = SparkSession.builder.appName("vevesta").getOrCreate()
+            columnNames = []
+            columnNames = value.columns
+            for i in range(len(value.columns)):
+                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
+            vectorCol = "corrfeatures"
+            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
+            df_vector = assembler.transform(value).select(vectorCol)
+            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
+            corrMatrix = matrix.toArray().tolist()
+            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
+            self.__correlation = dfCorr
 
     @property
     def fe(self):
@@ -421,8 +421,8 @@ def __textColor(self, val):
 
     def __profilingReport(self, fileName):
         sheetName = 'Profiling Report'
-        if self.speedUp:
-            return
+        # if self.speedUp:
+        #     return
         if not isinstance(self.__data, pandasDataframe):
             return
         if self.__data.empty or len(self.__data) == 0:
@@ -528,10 +528,10 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
         if (filename == None):
             filename = "vevesta.xlsx"
-            pdfFilename = "vevesta.pdf"
-        else:
-            pdfFilename=filename.split('.')
-            pdfFilename=pdfFilename[0]+'.pdf'
+            # pdfFilename = "vevesta.pdf"
+        # else:
+        #     pdfFilename=filename.split('.')
+        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # updating variables
         # when no V.start & v.end are not called, all variables in the code get tracked or in colab/kaggle where all variables will get tracked
@@ -634,26 +634,24 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
             if isinstance(self.__data, pysparkDataframe):
                 sampledData.toPandas().to_excel(writer, sheet_name='sampledata', index=False)
 
-            if self.speedUp == False:
-                if self.__correlation is not None:
-                    if isinstance(sampledData, pandasDataframe):
-                        pandasDataframe(self.__correlation).style. \
-                            applymap(self.__colorCellExcel). \
-                            applymap(self.__textColor). \
-                            to_excel(writer, sheet_name='EDA-correlation', index=True)
-
-                    if isinstance(sampledData, pysparkDataframe):
-                        correlation = self.__correlation.toPandas()
-                        correlation.set_index(correlation.columns, inplace=True)
-                        pandasDataframe(correlation).style. \
-                            applymap(self.__colorCellExcel). \
-                            applymap(self.__textColor). \
-                            to_excel(writer, sheet_name='EDA-correlation', index=True)
+            # if self.speedUp == False:
+            if self.__correlation is not None:
+                if isinstance(sampledData, pandasDataframe):
+                    pandasDataframe(self.__correlation).style. \
+                        applymap(self.__colorCellExcel). \
+                        applymap(self.__textColor). \
+                        to_excel(writer, sheet_name='EDA-correlation', index=True)
+
+                if isinstance(sampledData, pysparkDataframe):
+                    correlation = self.__correlation.toPandas()
+                    correlation.set_index(correlation.columns, inplace=True)
+                    pandasDataframe(correlation).style. \
+                        applymap(self.__colorCellExcel). \
+                        applymap(self.__textColor). \
+                        to_excel(writer, sheet_name='EDA-correlation', index=True)
+
 
         self.__profilingReport(filename)
 
-        if self.speedUp == False:
-            self.__EDA(filename)
         self.__plot(filename)
@@ -704,25 +702,27 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
         except Exception as e:
             print('File not pushed to git')
 
-    def __EDA(self, fileName):
-        if isinstance(self.__data, pandasDataframe):
-            self.__EDAForPandas(fileName)
+    def EDA(self, df, fileName):
+        if isinstance(df, pandasDataframe):
+            self.__EDAForPandas(df, fileName)
 
-    def __EDAForPandas(self, fileName):
+    def __EDAForPandas(self, df, xlsxFilename):
 
-        if self.__data.empty or len(self.__data) == 0:
+        if df.empty or len(df) == 0:
             return
 
-        if not isinstance(self.__data, pandasDataframe):
+        if not isinstance(df, pandasDataframe):
             return
 
-        if (fileName == None):
-            return print("Error: Provide the Excel File to plot the models")
+        # if (xlsxFileName == None):
+        #     return print("Error: Provide the Excel File to plot the models")
 
-        if fileName is None:
-            pdfFilename="vevesta.pdf"
+        if xlsxFilename is None:
+            xlsxFilename="EDA_Vevesta.xlsx"
+            pdfFilename="EDA_Vevesta.pdf"
         else:
-            pdfFilename=fileName.split('.')
+            # xlsxFilename = filename
+            pdfFilename=filename.split('.')
             pdfFilename=pdfFilename[0]+'.pdf'
 
         columnTextImgone = 'B2'
@@ -852,10 +852,14 @@ def __EDAForPandas(self, fileName):
 
         with open(pdfFilename,"wb") as f:
             f.write(convert(file,layout_fun=layout_function))
-
+        self.__writeEDADetailsToExcel(xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction)
+
 
-        if (isfile(fileName)):
-            workBook = load_workbook(fileName)
+    def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction):
+        # if ((xlsxFileName)):
+
+        writer = ExcelWriter(xlsxFilename, engine='openpyxl')
+        workBook = writer.book
         workBook.create_sheet('EDA-missingValues')
         plotSheet = workBook['EDA-missingValues']
         img = Image(join(directoryToDumpData, ValueImageFile))
@@ -927,9 +931,7 @@ def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImg
             pdfImage.anchor = columnTextImgone
             pdfPlotsheet.add_image(pdfImage)
 
-
-        workBook.save(fileName)
-        workBook.close()
+        writer.save()
 
     def __plot(self, fileName):
 
@@ -1021,11 +1023,12 @@ def __getExcelSheetData(self, fileName, sheetName):
         modelingData = read_excel(fileName, sheet_name=sheetName, index_col=[])
         return modelingData
 
+
     def commit(self, techniqueUsed, filename=None, message=None, version=None, projectId=None, repoName=None, branch=None):
         self.dump(techniqueUsed, filename=filename, message=message, version=version, showMessage=False, repoName=None)
 
         if filename is None:
-            pdfFilename="vevesta.pdf"
+            pdfFilename="EDA_Vevesta.pdf"
         else:
             pdfFilename=filename.split('.')
             pdfFilename=pdfFilename[0]+'.pdf'
@@ -1054,7 +1057,7 @@ def commit(self, techniqueUsed, filename=None, message=None, version=None, proje
         filename = self.get_filename()
 
         file_exists = exists(filename)
-        file1_exists=exists(pdfFilename)
+        eda_file_exists=exists(pdfFilename)
         if file_exists:
             files = {'file': open(filename, 'rb')}
             headers_for_file = {'Authorization': 'Bearer ' + token}
@@ -1063,7 +1066,7 @@ def commit(self, techniqueUsed, filename=None, message=None, version=None, proje
                                      files=files)
             attachments = list()
             attachments.append(response.json())
-            if file1_exists:
+            if eda_file_exists:
                 files = {'file': open(pdfFilename, 'rb')}
                 response = requests.post(url=backend_url + '/Attachments', headers=headers_for_file, params=params,
                                          files=files)
@@ -1082,8 +1085,7 @@ def commit(self, techniqueUsed, filename=None, message=None, version=None, proje
             "message": message,
             "modeling": self.__variables,
             "dataSourced": self.__dataSourcing,
-            "featureEngineered": self.__featureEngineering,
-            "speedUp": self.speedUp
+            "featureEngineered": self.__featureEngineering
         }
 
         if file_exists:

From 38b774c401234e22512e764086a711c70c13f875 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Thu, 1 Sep 2022 10:52:02 +0530
Subject: [PATCH 2/4] eda changes

---
 vevestaX/vevesta.py | 312 ++++++++++++++++++++++----------------------
 1 file changed, 155 insertions(+), 157 deletions(-)

diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index a8ca0f1..a14ccf9 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -36,19 +36,15 @@ def test():
 
 class Experiment(object):
 
-    def __init__(self, speedUp=False):
+    def __init__(self):
         self.__dataSourcing = None
         self.__featureEngineering = None
-        self.__data = None
-        self.__correlation = None
-
+
         self.__primitiveDataTypes = [int, str, float, bool]
         self.__startlocals = None
         self.__variables = {}
         self.__filename = self.get_filename()
         self.__sampleSize = 0
-        self.__Y=None
-        # self.speedUp = speedUp
 
     def get_filename(self):
         try:
@@ -177,28 +173,10 @@ def dataSourcing(self, value):
     def dataSourcing(self, value):
         if isinstance(value, pandasDataframe):
             self.__dataSourcing = value.columns.tolist()
-            self.__data = value
-            self.__sampleSize = len(value)
-            # if self.speedUp == False:
-            self.__correlation = value.corr(method='pearson')
 
         if isinstance(value, pysparkDataframe):
             self.__dataSourcing = value.columns
-            self.__data = value
-            self.__sampleSize = value.count()
-            # if self.speedUp == False:
-            spark = SparkSession.builder.appName("vevesta").getOrCreate()
-            columnNames = []
-            columnNames = value.columns
-            for i in range(len(value.columns)):
-                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
-            vectorCol = "corrfeatures"
-            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
-            df_vector = assembler.transform(value).select(vectorCol)
-            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
-            corrMatrix = matrix.toArray().tolist()
-            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
-            self.__correlation = dfCorr
+
 
     @property
     def ds(self):  # its doing the same work as dataSourcing do
@@ -282,24 +260,24 @@ def endModelling(self):
     # create alias of method modellingStart and modellingEnd
     start = startModelling
     end = endModelling
-    @property
-    def Y(self):
-        return self.__Y
+    # @property
+    # def Y(self):
+    #     return self.__Y
 
-    @Y.setter
-    def Y(self,value):
-        if isinstance(value,Series):
-            if value.size==self.__sampleSize:
-                self.__Y=value
-                self.____YcolumnName=None
-            else:
-                print('Panda series size not matching with the dataframe size')
-        elif isinstance(value,str):
-            if value in self.__data.columns:
-                self.__Y=self.__data[value]
-                self.____YcolumnName=value
-            else:
-                print("Column not found")
+    # @Y.setter
+    # def Y(self,value):
+    #     if isinstance(value,Series):
+    #         if value.size==self.__sampleSize:
+    #             self.__Y=value
+    #             self.____YcolumnName=None
+    #         else:
+    #             print('Panda series size not matching with the dataframe size')
+    #     elif isinstance(value,str):
+    #         if value in self.__data.columns:
+    #             self.__Y=self.__data[value]
+    #             self.____YcolumnName=value
+    #         else:
+    #             print("Column not found")
 
     # function to get arguments of a function
     def param(self, **decoratorparam):
@@ -419,27 +397,25 @@ def __textColor(self, val):
         color = 'white'
         return 'color: %s' % color
 
-    def __profilingReport(self, fileName):
+    def __profilingReport(self, df, writer):
         sheetName = 'Profiling Report'
         # if self.speedUp:
         #     return
-        if not isinstance(self.__data, pandasDataframe):
+        if not isinstance(df, pandasDataframe):
             return
-        if self.__data.empty or len(self.__data) == 0:
+        if df.empty or len(df) == 0:
             return
-        if fileName is None:
-            return print("Error: Provide the Excel File")
 
         data = [
-            {'Number_of_observation': self.__data.shape[0],
-             'Number_of_variables': self.__data.shape[1],
-             'Missing_cells': self.__data.isna().sum().sum(),
-             'Missing_cells(%)': (self.__data.isnull().sum().sum() * 100) / (
-                     self.__data.notnull().sum().sum() + self.__data.isnull().sum().sum()),
-             'Duplicate_rows': self.__data.duplicated().sum(),
-             'Duplicate_rows(%)': (self.__data.duplicated().sum() * 100) / len(self.__data),
-             'Total_size_in_memory(byte)': self.__data.memory_usage().sum(),
-             'Average_record_size_in_memory(byte)': self.__data.memory_usage().sum() / len(self.__data)
+            {'Number_of_observation': df.shape[0],
+             'Number_of_variables': df.shape[1],
+             'Missing_cells': df.isna().sum().sum(),
+             'Missing_cells(%)': (df.isnull().sum().sum() * 100) / (
+                     df.notnull().sum().sum() + df.isnull().sum().sum()),
+             'Duplicate_rows': df.duplicated().sum(),
+             'Duplicate_rows(%)': (df.duplicated().sum() * 100) / len(df),
+             'Total_size_in_memory(byte)': df.memory_usage().sum(),
+             'Average_record_size_in_memory(byte)': df.memory_usage().sum() / len(df)
             }
         ]
         profilingDataframe = pandasDataframe(data)
@@ -450,71 +426,70 @@ def __profilingReport(self, fileName):
                            "Skewness", "Median", "Mode", "Outliers", "Outliers (%)", "Q1 quantile", "Q2 quantile",
                            "Q3 quantile", "100th quantile", "Total Memory Size(bytes)"]})
 
-        numericColumns = self.__data.select_dtypes(include=["number"]).columns
+        numericColumns = df.select_dtypes(include=["number"]).columns
         for col in numericColumns:
             # finding outliers for each column
-            Q1 = quantile(self.__data[col], 0.25)
-            Q3 = quantile(self.__data[col], 0.75)
+            Q1 = quantile(df[col], 0.25)
+            Q3 = quantile(df[col], 0.75)
             IQR = Q3 - Q1
-            outlier = ((self.__data[col] < (Q1 - 1.5 * IQR)) | (self.__data[col] > (Q3 + 1.5 * IQR))).sum()
-            col_dict = {"Distinct": self.__data[col].nunique(),
-                        "Distinct (%)": self.__data[col].nunique() * 100 / self.__data.shape[0],
-                        "Missing": self.__data[col].isna().sum(),
-                        "Missing (%)": (self.__data[col].isnull().sum() * 100) / (self.__data.shape[0]),
-                        "Infinite": isinf(self.__data[col]).values.sum(),
-                        "Infinite (%)": isinf(self.__data[col]).values.sum() * 100 / (self.__data.shape[0]),
-                        "Mean": self.__data[col].mean(),
-                        "Minimum": self.__data[col].min(),
-                        "Maximum": self.__data[col].max(),
-                        "Zeros": (self.__data[col] == 0).sum(),
-                        "Zeros (%)": (self.__data[col] == 0).sum() * 100 / self.__data.shape[0],
-                        "Negative": (self.__data[col] < 0).sum(),
-                        "Negative (%)": (self.__data[col] < 0).sum() * 100 / self.__data.shape[0],
-                        "Kurtosis": kurtosis(self.__data[col], axis=0, bias=True),
-                        "Skewness": skew(self.__data[col], axis=0, bias=True),
-                        "Median": median(self.__data[col]),
-                        "Mode": mode(self.__data[col]),
+            outlier = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
+            col_dict = {"Distinct": df[col].nunique(),
+                        "Distinct (%)": df[col].nunique() * 100 / df.shape[0],
+                        "Missing": df[col].isna().sum(),
+                        "Missing (%)": (df[col].isnull().sum() * 100) / (df.shape[0]),
+                        "Infinite": isinf(df[col]).values.sum(),
+                        "Infinite (%)": isinf(df[col]).values.sum() * 100 / (df.shape[0]),
+                        "Mean": df[col].mean(),
+                        "Minimum": df[col].min(),
+                        "Maximum": df[col].max(),
+                        "Zeros": (df[col] == 0).sum(),
+                        "Zeros (%)": (df[col] == 0).sum() * 100 / df.shape[0],
+                        "Negative": (df[col] < 0).sum(),
+                        "Negative (%)": (df[col] < 0).sum() * 100 / df.shape[0],
+                        "Kurtosis": kurtosis(df[col], axis=0, bias=True),
+                        "Skewness": skew(df[col], axis=0, bias=True),
+                        "Median": median(df[col]),
+                        "Mode": mode(df[col]),
                         "Outliers": outlier,
-                        "Outliers (%)": outlier * 100 / self.__data.shape[0],
-                        "Q1 quantile": quantile(self.__data[col], 0.25),
-                        "Q2 quantile": quantile(self.__data[col], 0.5),
-                        "Q3 quantile": quantile(self.__data[col], 0.75),
-                        "100th quantile": quantile(self.__data[col], 1),
-                        "Total Memory Size(bytes)": self.__data[col].memory_usage()}
+                        "Outliers (%)": outlier * 100 / df.shape[0],
+                        "Q1 quantile": quantile(df[col], 0.25),
+                        "Q2 quantile": quantile(df[col], 0.5),
+                        "Q3 quantile": quantile(df[col], 0.75),
+                        "100th quantile": quantile(df[col], 1),
+                        "Total Memory Size(bytes)": df[col].memory_usage()}
             profileOfVariableDataframe[col] = col_dict.values()
 
-        nonNumericalColumns = self.__data.select_dtypes(exclude=["number", "datetime"]).columns
+        nonNumericalColumns = df.select_dtypes(exclude=["number", "datetime"]).columns
         for col in nonNumericalColumns:
-            col_dict = {"Distinct": self.__data[col].nunique(),
-                        "Distinct (%)": self.__data[col].nunique() * 100 / self.__data.shape[0],
-                        "Missing": self.__data[col].isna().sum(),
-                        "Missing (%)": (self.__data[col].isnull().sum() * 100) / (self.__data.shape[0]),
+            col_dict = {"Distinct": df[col].nunique(),
+                        "Distinct (%)": df[col].nunique() * 100 / df.shape[0],
+                        "Missing": df[col].isna().sum(),
+                        "Missing (%)": (df[col].isnull().sum() * 100) / (df.shape[0]),
                         "Infinite": "NA",
                         "Infinite (%)": "NA",
                         "Mean": "NA",
                         "Minimum": "NA",
                         "Maximum": "NA",
                         "Zeros": "NA",
                         "Zeros (%)": "NA",
                         "Negative": "NA",
                         "Negative (%)": "NA",
                         "Kurtosis": "NA",
                         "Skewness": "NA",
                         "Median": "NA",
                         "Mode": "NA",
                         "Outliers": "NA",
                         "Outliers (%)": "NA",
                         "Q1 quantile": "NA",
                         "Q2 quantile": "NA",
                         "Q3 quantile": "NA",
                         "100th quantile": "NA",
-                        "Total Memory Size(bytes)": self.__data[col].memory_usage()}
+                        "Total Memory Size(bytes)": df[col].memory_usage()}
             profileOfVariableDataframe[col] = col_dict.values()
 
-        if isfile(fileName):
-            with ExcelWriter(fileName, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
-                profilingDataframe.to_excel(writer, sheet_name="Profiling Report", index=False)
-                profileOfVariableDataframe.to_excel(writer, sheet_name="Variables Data Profile", index=False)
+
+        profilingDataframe.to_excel(writer, sheet_name="Profiling Report", index=False)
+        profileOfVariableDataframe.to_excel(writer, sheet_name="Variables Data Profile", index=False)
 
     def dump(self, techniqueUsed, filename=None, message=None, version=None, showMessage=True, repoName=None):
@@ -528,10 +503,6 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
         if (filename == None):
             filename = "vevesta.xlsx"
-            # pdfFilename = "vevesta.pdf"
-        # else:
-        #     pdfFilename=filename.split('.')
-        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # updating variables
         # when no V.start & v.end are not called, all variables in the code get tracked or in colab/kaggle where all variables will get tracked
@@ -607,18 +578,6 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
         df_messages = pandasDataframe(index=[1], data=data)
         df_messages = concat([messageData, df_messages], ignore_index=True)
 
-        self.__sampleSize = 100 if self.__sampleSize >= 100 else self.__sampleSize
-
-        if isinstance(self.__data, pandasDataframe):
-            sampledData = self.__data.sample(self.__sampleSize)
-
-        if isinstance(self.__data, pysparkDataframe):
-            if self.__data.count() >= 100:
-                sampledData = self.__data.sample(100 / self.__data.count())
-
-            if self.__data.count() < 100:
-                sampledData = self.__data.sample(1.0)
-
         with ExcelWriter(filename, engine='openpyxl') as writer:
 
             df_dataSourcing.to_excel(writer, sheet_name='dataSourcing', index=False)
@@ -629,30 +588,5 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
             df_messages.to_excel(writer, sheet_name='messages', index=False)
 
-            if isinstance(self.__data, pandasDataframe):
-                pandasDataframe(sampledData).to_excel(writer, sheet_name='sampledata', index=False)
-
-            if isinstance(self.__data, pysparkDataframe):
-                sampledData.toPandas().to_excel(writer, sheet_name='sampledata', index=False)
-
-            # if self.speedUp == False:
-            if self.__correlation is not None:
-                if isinstance(sampledData, pandasDataframe):
-                    pandasDataframe(self.__correlation).style. \
-                        applymap(self.__colorCellExcel). \
-                        applymap(self.__textColor). \
-                        to_excel(writer, sheet_name='EDA-correlation', index=True)
-
-                if isinstance(sampledData, pysparkDataframe):
-                    correlation = self.__correlation.toPandas()
-                    correlation.set_index(correlation.columns, inplace=True)
-                    pandasDataframe(correlation).style. \
-                        applymap(self.__colorCellExcel). \
-                        applymap(self.__textColor). \
-                        to_excel(writer, sheet_name='EDA-correlation', index=True)
-
-        self.__profilingReport(filename)
-
-        self.__plot(filename)
 
         print("Dumped the experiment in the file " + filename)
@@ -704,11 +638,72 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
         except Exception as e:
             print('File not pushed to git')
 
-    def EDA(self, df, fileName):
+    def __writeEDACorrelationData(self, value, sampledData, writer):
+        if isinstance(value, pandasDataframe):
+            correlation = value.corr(method='pearson')
+        if isinstance(value, pysparkDataframe):
+            spark = SparkSession.builder.appName("vevesta").getOrCreate()
+            columnNames = []
+            columnNames = value.columns
+            for i in range(len(value.columns)):
+                value = value.withColumn(columnNames[i], value[columnNames[i]].cast(DoubleType()))
+            vectorCol = "corrfeatures"
+            assembler = VectorAssembler(inputCols=value.columns, outputCol=vectorCol)
+            df_vector = assembler.transform(value).select(vectorCol)
+            matrix = Correlation.corr(df_vector, vectorCol).collect()[0][0]
+            corrMatrix = matrix.toArray().tolist()
+            dfCorr = spark.createDataFrame(corrMatrix, columnNames)
+            correlation = dfCorr
+
+        if correlation is not None:
+            if isinstance(sampledData, pandasDataframe):
+                pandasDataframe(correlation).style. \
+                    applymap(self.__colorCellExcel). \
+                    applymap(self.__textColor). \
+                    to_excel(writer, sheet_name='EDA-correlation', index=True)
+
+            if isinstance(sampledData, pysparkDataframe):
+                correlationPandas = correlation.toPandas()
+                correlationPandas.set_index(correlationPandas.columns, inplace=True)
+                pandasDataframe(correlationPandas).style. \
+                    applymap(self.__colorCellExcel). \
+                    applymap(self.__textColor). \
+                    to_excel(writer, sheet_name='EDA-correlation', index=True)
+
+
+    def __writeSampledData(self, df, writer):
+
+        if isinstance(df, pandasDataframe):
+            sampleSize = df.shape[0]
+
+        if isinstance(df, pysparkDataframe):
+            sampleSize = df.count()
+
+        sampleSize = 100 if sampleSize >= 100 else sampleSize
+
         if isinstance(df, pandasDataframe):
-            self.__EDAForPandas(df, fileName)
+            sampledData = df.sample(sampleSize)
+
+        if isinstance(df, pysparkDataframe):
+            if df.count() >= 100:
+                sampledData = df.sample(100 / df.count())
+
+            if df.count() < 100:
+                sampledData = df.sample(1.0)
+
+        if isinstance(df, pandasDataframe):
+            pandasDataframe(sampledData).to_excel(writer, sheet_name='sampledata', index=False)
+
+        if isinstance(df, pysparkDataframe):
+            sampledData.toPandas().to_excel(writer, sheet_name='sampledata', index=False)
+
+        return sampledData
+
+    def EDA(self, data, Y, fileName):
+        if isinstance(data, pandasDataframe):
+            self.__EDAForPandas(data, Y, fileName)
 
-    def __EDAForPandas(self, df, xlsxFilename):
+    def __EDAForPandas(self, df, Y, xlsxFilename):
 
         if df.empty or len(df) == 0:
             return
@@ -742,15 +737,15 @@ def __EDAForPandas(self, df, xlsxFilename):
 
         # EDA missing values
         plt.figure(figsize=(6, 6))
-        plt.imshow(self.__data.isna(), aspect="auto", interpolation="nearest", cmap="coolwarm", extent=[0, 7, 0, 7])
+        plt.imshow(df.isna(), aspect="auto", interpolation="nearest", cmap="coolwarm", extent=[0, 7, 0, 7])
         plt.title("Sample Number vs Column Number")
         plt.xlabel("Column Number")
         plt.ylabel("Sample Number")
         plt.savefig(join(directoryToDumpData, ValueImageFile), bbox_inches='tight', dpi=100)
         plt.close()
 
         # eda numeric feature distribution
-        RatioData = self.__data.isna().mean().sort_values()
+        RatioData = df.isna().mean().sort_values()
         xAxis = list(RatioData.index)
         yAxis = list(RatioData)
         plt.figure(figsize=(6, 6))
@@ -760,23 +755,23 @@ def __EDAForPandas(self, df, xlsxFilename):
         plt.savefig(join(directoryToDumpData, ValueRatioImageFile), bbox_inches='tight', dpi=100)
         plt.close()
 
         # eda non numeric feature distribution
-        self.__data.plot(lw=0, marker="x", subplots=True, layout=(-1, 4), figsize=(10, 10), markersize=5,
+        df.plot(lw=0, marker="x", subplots=True, layout=(-1, 4), figsize=(10, 10), markersize=5,
                 title="Numeric feature Distribution(with X-axis representing the position in the file)").flatten()
         plt.tight_layout()
         plt.savefig(join(directoryToDumpData, NumericalFeatureDistributionImageFile), bbox_inches='tight',
                     dpi=100)
         plt.close()
 
         # EDA for outliers
-        numericColumns = self.__data.select_dtypes(include=["number"])
+        numericColumns = df.select_dtypes(include=["number"])
         red_circle = dict(markerfacecolor='red', marker='o', markeredgecolor='white')
         fig, axs = plt.subplots(2, len(numericColumns.columns)//2, figsize=(10, 10))
         fig.suptitle('Outliers',fontsize=20)
         for i, ax in enumerate(axs.flat):
             ax.boxplot(numericColumns.iloc[:, i], flierprops=red_circle)
-            ax.set_title(self.__data.columns[i], fontsize=15)
+            ax.set_title(df.columns[i], fontsize=15)
             #ax.tick_params(axis='both', labelrotation=45)
         plt.subplots_adjust(wspace=2)
         plt.savefig(join(directoryToDumpData, OutliersImageFile), bbox_inches='tight', dpi=100)
         plt.close()
@@ -789,7 +784,7 @@ def __EDAForPandas(self, df, xlsxFilename):
         plt.close()"""
 
         # Identify non-numerical features
-        nonNumericalColumns = self.__data.select_dtypes(exclude=["number", "datetime"])
+        nonNumericalColumns = df.select_dtypes(exclude=["number", "datetime"])
         if len(nonNumericalColumns.columns) != 0:
             fig = plt.figure(figsize=(7, 7))
             k = 1
@@ -821,25 +816,25 @@ def __EDAForPandas(self, df, xlsxFilename):
 
         # feature distribution
-        fig = self.__data.hist(bins=len(self.__data), figsize=(30, 25), layout=(-1, 3), edgecolor="black",
+        fig = df.hist(bins=len(df), figsize=(30, 25), layout=(-1, 3), edgecolor="black",
                       xlabelsize=15, ylabelsize=15)
         [x.title.set_size(15) for x in fig.ravel()]
         [x.tick_params(axis='x', labelrotation=90) for x in fig.ravel()]
         plt.savefig(join(directoryToDumpData, FeatureHistogramImageFile), bbox_inches='tight', dpi=100)
         plt.close()
 
         #Probability Density Function
-        numericDataframe = self.__data.select_dtypes(include='number')
-        if self.__Y is not None and (self.__Y.dtype=='int64' or self.__Y.dtype=='object') and self.__Y.dtype!='float64':
-            k=1
-            fig = plt.figure(figsize=(20,15))
-            for i in numericDataframe:
-                if i!=self.____YcolumnName:
+        numericDataframe = df.select_dtypes(include='number')
+        if Y is not None and (Y.dtype=='int64' or Y.dtype=='object') and Y.dtype!='float64':
+            if len(Y) == numericDataframe.shape[0]:
+                k=1
+                fig = plt.figure(figsize=(20,15))
+                for i in numericDataframe:
                     ax = fig.add_subplot(4,(len(numericDataframe.columns)//4)+1, k)
-                    frequency=self.__Y.value_counts().keys().tolist()[0:10]
-                    y=self.__Y[self.__Y.isin(frequency)]
+                    frequency=Y.value_counts().keys().tolist()[0:10]
+                    y=Y[self.__Y.isin(frequency)]
                     sns.kdeplot(x=numericDataframe[i],hue=y, ax = ax,fill=True)
                     k+=1
-            plt.savefig(join(directoryToDumpData, ProbabilityDensityFunction), bbox_inches='tight', dpi=100)
-            plt.close()
+                plt.savefig(join(directoryToDumpData, ProbabilityDensityFunction), bbox_inches='tight', dpi=100)
+                plt.close()
 
 
@@ -852,11 +847,11 @@ def __EDAForPandas(self, df, xlsxFilename):
 
         with open(pdfFilename,"wb") as f:
             f.write(convert(file,layout_fun=layout_function))
-        self.__writeEDADetailsToExcel(xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction)
+        self.__writeEDADetailsToExcel(df, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction)
 
 
-    def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction):
+    def __writeEDADetailsToExcel(self, df, xlsxFilename, columnTextImgone, columnTextImgtwo, directoryToDumpData, ValueImageFile, ValueRatioImageFile, NumericalFeatureDistributionImageFile, NonNumericFeaturesImgFile, FeatureHistogramImageFile, OutliersImageFile, ProbabilityDensityFunction):
         # if ((xlsxFileName)):
 
         writer = ExcelWriter(xlsxFilename, engine='openpyxl')
         workBook = writer.book
@@ -898,7 +893,7 @@ def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImg
 
         # adding non-numeric column
-        nonNumericalColumns = self.__data.select_dtypes(exclude=["number", "datetime"])
+        nonNumericalColumns = df.select_dtypes(exclude=["number", "datetime"])
         if len(nonNumericalColumns.columns) != 0 and exists(
                 join(directoryToDumpData, NonNumericFeaturesImgFile)):
             workBook.create_sheet('EDA-NonNumericFeatures')
@@ -931,6 +926,9 @@ def __writeEDADetailsToExcel(self, xlsxFilename, columnTextImgone, columnTextImg
             pdfImage.anchor = columnTextImgone
             pdfPlotsheet.add_image(pdfImage)
 
+        sampledData = self.__writeSampledData(df, writer)
+        self.__writeEDACorrelationData(df, sampledData, writer)
+        self.__profilingReport(df, writer)
         writer.save()
 
     def __plot(self, fileName):

From 7b3ed27a3b44eade3899cc560e90e76ae5a31582 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Fri, 2 Sep 2022 00:47:27 +0530
Subject: [PATCH 3/4] eda changes

---
 vevestaX/vevesta.py | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index a14ccf9..152a67e 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -260,24 +260,24 @@ def endModelling(self):
     # create alias of method modellingStart and modellingEnd
     start = startModelling
     end = endModelling
-    # @property
-    # def Y(self):
-    #     return self.__Y
+    @property
+    def Y(self):
+        return self.__Y
 
-    # @Y.setter
-    # def Y(self,value):
-    #     if isinstance(value,Series):
-    #         if value.size==self.__sampleSize:
-    #             self.__Y=value
-    #             self.____YcolumnName=None
-    #         else:
-    #             print('Panda series size not matching with the dataframe size')
-    #     elif isinstance(value,str):
-    #         if value in self.__data.columns:
-    #             self.__Y=self.__data[value]
-    #             self.____YcolumnName=value
-    #         else:
-    #             print("Column not found")
+    @Y.setter
+    def Y(self,value):
+        if isinstance(value,Series):
+            if value.size==self.__sampleSize:
+                self.__Y=value
+                self.____YcolumnName=None
+            else:
+                print('Panda series size not matching with the dataframe size')
+        elif isinstance(value,str):
+            if value in self.__data.columns:
+                self.__Y=self.__data[value]
+                self.____YcolumnName=value
+            else:
+                print("Column not found")
 
     # function to get arguments of a function
     def param(self, **decoratorparam):
@@ -503,6 +503,10 @@ def dump(self, techniqueUsed, filename=None, message=None, version=None, showMes
 
         if (filename == None):
             filename = "vevesta.xlsx"
+            # pdfFilename = "vevesta.pdf"
+        # else:
+        #     pdfFilename=filename.split('.')
+        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # updating variables
         # when no V.start & v.end are not called, all variables in the code get tracked or in colab/kaggle where all variables will get tracked
@@ -824,16 +828,17 @@ def __EDAForPandas(self, df, Y, xlsxFilename):
 
         #Probability Density Function
         numericDataframe = df.select_dtypes(include='number')
-        if Y is not None and (Y.dtype=='int64' or Y.dtype=='object') and Y.dtype!='float64':
+        if Y is not None and (Y.dtype=='int32' or Y.dtype=='int64' or Y.dtype=='object') and Y.dtype!='float64':
             if len(Y) == numericDataframe.shape[0]:
                 k=1
                 fig = plt.figure(figsize=(20,15))
                 for i in numericDataframe:
                     ax = fig.add_subplot(4,(len(numericDataframe.columns)//4)+1, k)
                     frequency=Y.value_counts().keys().tolist()[0:10]
-                    y=Y[self.__Y.isin(frequency)]
+                    y=Y[Y.isin(frequency)]
                     sns.kdeplot(x=numericDataframe[i],hue=y, ax = ax,fill=True)
                     k+=1
+                plt.suptitle('Probability Density Function',fontsize=20)
                 plt.savefig(join(directoryToDumpData, ProbabilityDensityFunction), bbox_inches='tight', dpi=100)
                 plt.close()
@@ -918,7 +923,7 @@ def __writeEDADetailsToExcel(self, df, xlsxFilename, columnTextImgone, columnTex
 
         if exists(join(directoryToDumpData, ProbabilityDensityFunction)):
-            workBookName = 'EDA-PDF'
+            workBookName = 'EDA-ProbabilityDensityFunction'
             workBook.create_sheet(workBookName)
             pdfPlotsheet = workBook[workBookName]
             pdfImage = Image(

From e74b916ba881d19eaca2e100e68f23a22a0269c1 Mon Sep 17 00:00:00 2001
From: Pavan
Date: Fri, 2 Sep 2022 15:31:45 +0530
Subject: [PATCH 4/4] hardcoding EDA filename

---
 vevestaX/vevesta.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vevestaX/vevesta.py b/vevestaX/vevesta.py
index 152a67e..eb5576d 100644
--- a/vevestaX/vevesta.py
+++ b/vevestaX/vevesta.py
@@ -701,9 +701,9 @@ def __writeSampledData(self, df, writer):
 
         return sampledData
 
-    def EDA(self, data, Y, fileName):
+    def EDA(self, data, Y):
         if isinstance(data, pandasDataframe):
-            self.__EDAForPandas(data, Y, fileName)
+            self.__EDAForPandas(data, Y, "EDA_Vevesta.xlsx")
 
     def __EDAForPandas(self, df, Y, xlsxFilename):
 
@@ -721,7 +721,7 @@ def __EDAForPandas(self, df, Y, xlsxFilename):
             pdfFilename="EDA_Vevesta.pdf"
         else:
             # xlsxFilename = filename
-            pdfFilename=filename.split('.')
+            pdfFilename=xlsxFilename.split('.')
             pdfFilename=pdfFilename[0]+'.pdf'
 
         columnTextImgone = 'B2'
@@ -1030,11 +1030,11 @@ def __getExcelSheetData(self, fileName, sheetName):
 
     def commit(self, techniqueUsed, filename=None, message=None, version=None, projectId=None, repoName=None, branch=None):
         self.dump(techniqueUsed, filename=filename, message=message, version=version, showMessage=False, repoName=None)
-        if filename is None:
-            pdfFilename="EDA_Vevesta.pdf"
-        else:
-            pdfFilename=filename.split('.')
-            pdfFilename=pdfFilename[0]+'.pdf'
+        # if filename is None:
+        pdfFilename="EDA_Vevesta.pdf"
+        # else:
+        #     pdfFilename=filename.split('.')
+        #     pdfFilename=pdfFilename[0]+'.pdf'
 
         # api-endpoint
         token = self.__find_access_token()
         backend_url = 'https://api.matrixkanban.com/services-1.0-SNAPSHOT'
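
Usage note: after this series, EDA is decoupled from dump()/commit() and always
writes EDA_Vevesta.xlsx / EDA_Vevesta.pdf (PATCH 4/4 hardcodes the names). A
minimal sketch of the resulting flow, assuming the package's usual
"from vevestaX import vevesta" import; the input file and the "target" label
column are illustrative assumptions, not part of the patches:

    import pandas as pd
    from vevestaX import vevesta as v   # assumed import path

    V = v.Experiment()                  # speedUp argument was dropped in PATCH 2/4
    df = pd.read_csv("train.csv")       # illustrative input
    V.ds = df                           # track data-sourcing columns
    V.fe = df                           # track feature-engineering columns

    # Writes EDA_Vevesta.xlsx and EDA_Vevesta.pdf to the working directory.
    # Y drives the class-conditional density plots; per PATCH 3/4 it should be
    # an int32/int64/object series matching the dataframe's row count.
    V.EDA(data=df, Y=df["target"])      # "target" is an assumed label column

    V.dump(techniqueUsed="XGBoost")     # experiment log only; sampledata,
                                        # correlation and profiling sheets now
                                        # land in the EDA workbook instead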