Skip to content

Commit

Permalink
Ability to combine columns
Browse files Browse the repository at this point in the history
  • Loading branch information
zhendrikse committed Oct 17, 2023
1 parent 0141553 commit 323b31e
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,18 @@
from excel_data_reader import ExcelDataReader
from data_processor import DataProcessor

# Questionnaire column codes referenced by the dashboard.
col1 = "Q2"  # column behind the "Select gender:" sidebar filter
col2 = "Q4"  # column behind the "Select days per week:" sidebar filter
col3 = "Dag_treinreis"  # same column name the "Select the day:" filter reads
col4 = "Q18D"  # column behind the "Select purpose:" sidebar filter
col5 = "LeeftCat"  # presumably an age category — TODO confirm against the data sheet
col6 = "vertrekstation"  # presumably the departure station — TODO confirm
col7 = "aankomststation"  # presumably the arrival station — TODO confirm

class Dashboard:
def __init__(self, excel_file_name):
self._init_webpage()
data_processor = DataProcessor(ExcelDataReader(excel_file_name))
self._ritten = data_processor.ritten
self._overview = data_processor.combined_sheets
#self._data = data_processor.ritten
#self._variables = data_processor.variables
self._data = data_processor.get_dataframe_for(
('Q4', 'Q4A_1', 'Q4A_2', 'Q6', 'Q8', 'HQ8', 'Q25', 'Q25A', 'Q25B', 'Q25C', 'Q25D', 'Q26', 'Q27A', 'Q27B'))
self._data = data_processor.merge_two_columns(self._data, 'Q25', 'Q25B')

# if 'dataframe' not in strmlit.session_state:
# strmlit.dataframe(self._ritten)
# strmlit.dataframe(self._data)

if 'count' not in strmlit.session_state:
strmlit.session_state.count = 0
Expand Down Expand Up @@ -51,39 +46,34 @@ def tester(self):

def _define_filters(self):
    """Render the sidebar filters and narrow ``self._data`` to the selection.

    Three multiselect widgets are shown, one each for the ``Q6``, ``Q4`` and
    ``Q25_Q25B`` columns. Every widget offers all distinct values of its
    column and starts with all of them selected. The selections are then
    applied to ``self._data`` and the filtered frame is displayed.
    """
    strmlit.sidebar.header("Please filter here:")

    # Compute each column's distinct values once; they serve both as the
    # available options and as the initial (everything selected) default.
    q6_choices = self._data['Q6'].unique()
    q4_choices = self._data['Q4'].unique()
    transport_choices = self._data['Q25_Q25B'].unique()

    # The local names below are referenced from the query string through the
    # ``@name`` syntax, so they must keep exactly these spellings.
    Q6 = strmlit.sidebar.multiselect(
        "Laatst gemaakte treinreis:",
        options=q6_choices,
        default=q6_choices,
    )
    Q4 = strmlit.sidebar.multiselect(
        "Treinreisfrequentie de afgelopen 12 maanden:",
        options=q4_choices,
        default=q4_choices,
    )
    # NOTE(review): "gebrach" in the label below looks like a typo; left
    # untouched because it is user-facing text.
    Q25_Q25B = strmlit.sidebar.multiselect(
        "Soort vervoermiddel gebrach:",
        options=transport_choices,
        default=transport_choices,
    )

    # Every term needs the ``@`` prefix on its right-hand side. Without it
    # the column is compared with itself and the widget selection is ignored.
    self._data = self._data.query(
        "Q6 == @Q6 & Q4 == @Q4 & Q25_Q25B == @Q25_Q25B"
    )

    strmlit.dataframe(self._data)

def cleanse(self):
    """Keep only rows answered "Een jongen" or "Een meisje" and display them.

    The filtered frame replaces ``self._data``.
    """
    # Use the column code directly rather than a module-level alias, so this
    # method does not fail with a NameError when that alias is not defined.
    # NOTE(review): ``self._data`` must actually contain a "Q2" column for
    # this to work — verify against the column selection made in __init__.
    gender_column = "Q2"
    accepted_answers = ["Een jongen", "Een meisje"]

    keep_row = self._data[gender_column].isin(accepted_answers)
    self._data = self._data[keep_row]
    strmlit.dataframe(self._data)

def plot_chart(self):
travel_frequency_data = self._overview.value_counts("Treinreisfrequentie afgelopen 12 maanden (q4)")
travel_frequency_data = self._data.value_counts("Q4")
bar_chart = plotly.bar(
travel_frequency_data,
#x = "count",
Expand All @@ -93,12 +83,46 @@ def plot_chart(self):
#color_discrete_sequence = ["#0083B8"] * len (travel_frequency_data),
template="plotly_white"
)
strmlit.plotly_chart(bar_chart)

travel_motivation_data = self._data.value_counts("HQ8")
bar_chart_2 = plotly.bar(
travel_motivation_data,
#x = "count",
#y = travel_frequency_data.index,
#orientation = "h",
title="<b>Travel Motivation</b>",
#color_discrete_sequence = ["#0083B8"] * len (travel_frequency_data),
template="plotly_white"
)

with self.col1:
strmlit.plotly_chart(bar_chart)
with self.col2:
strmlit.plotly_chart(bar_chart_2)

labels = self._data.value_counts("Q25_Q25B").index
values = self._data.value_counts("Q25_Q25B").values
labels = [label if label.strip() else "Onbekend" for label in labels]

pie_chart = plotly.pie(
labels = labels,
values = values,
names = labels,
title="<b>Transport to station</b>",
template="plotly_white"
)
strmlit.plotly_chart(pie_chart)



def render(self):
self._define_filters()

# option = strmlit.selectbox(
# 'How would you like to be contacted?',
# self._variables.iloc[:, 1].to_list())
# strmlit.write('You selected:', option)

with self.col1:
strmlit.button("Increment", on_click=self.add)
strmlit.button("Subtract", on_click=self.subtract)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ def __init__(self, raw_data_reader):
def ritten(self):
return self._ritten

@property
def variables(self):
    """Return the ``_variables`` attribute held by this processor."""
    return self._variables

def get_column_by_code(self, column_code):
    """Return the column of the ``_ritten`` frame stored under ``column_code``."""
    return self._ritten[column_code]

Expand All @@ -25,6 +29,21 @@ def header_values(self):
headers = list(self._ritten)
return [self.find_header_description_for(header) for header in headers]

def get_dataframe_for(self, column_codes_list):
    """Return the sub-frame of ``_ritten`` limited to the given column codes.

    Selection is done with a boolean mask over the existing columns, so the
    result keeps the column order of ``_ritten`` and codes that are not
    present in the frame are ignored rather than raising.
    """
    source = self._ritten
    is_requested = source.columns.isin(column_codes_list)
    return source.loc[:, is_requested]

def merge_two_columns(self, dataframe, column_1, column_2):
    """Combine two columns of ``dataframe`` into a single new column.

    Cells that are empty or contain only spaces are treated as missing.
    For each row the merged column takes the value from ``column_1`` and
    falls back to ``column_2`` when ``column_1`` is missing. Both source
    columns are removed and the merged column is appended as the last
    column, named ``column_1 + "_" + column_2``.

    The input frame is not modified; a new frame is returned.
    """
    # Use an explicit NaN as the replacement value: a literal ``None`` is
    # interpreted as "no value given" by older pandas releases, which then
    # leaves the blank cells in place.
    cleaned = dataframe.replace(r'^\ *$', float("nan"), regex=True)

    # Prefer column_1; fill its gaps from column_2.
    merged = cleaned[column_1].combine_first(cleaned[column_2])
    merged = merged.rename(column_1 + "_" + column_2)

    remaining = cleaned.drop(columns=[column_1, column_2])
    return pnds.concat([remaining, merged], axis=1)

@property
def combined_sheets(self):
new_headers = self.header_values()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,54 @@

class TestDataProcessor:
    """Tests for ``DataProcessor`` running against the stubbed raw-data reader."""

    @pytest.fixture(autouse=True)
    def data_processor(self):
        """Provide a processor built on the stub reader for the test workbook."""
        return DataProcessor(StubRawDataReader("test/ritten-jan_mar_2023.xlsx"))

    def test_replace_header_codes(self, data_processor):
        assert_that(data_processor.header_values()[3], equal_to('Treinreisfrequentie afgelopen 12 maanden (q4)'))

    def test_header_description_for_non_existing_column_code(self, data_processor):
        # An unknown code is expected to be returned unchanged.
        assert_that(data_processor.find_header_description_for('does_not_exist'), equal_to("does_not_exist"))

    def test_header_description_for_existing_column_code(self, data_processor):
        assert_that(data_processor.find_header_description_for('Q2'), equal_to("Geslacht kind (q2)"))
        assert_that(data_processor.find_header_description_for('Q59'), equal_to("Samenstelling huishouden (q59)"))

    def test_updated_dataframe_headers(self, data_processor):
        updated_frame = data_processor.combined_sheets
        headers = list(updated_frame)

        assert_that(data_processor.combined_sheets.shape, equal_to((ROWS_TO_READ, DATA_ROW_COUNT)))
        assert_that(data_processor.header_values(), equal_to(headers))

    def test_read_questionnaire(self, data_processor):
        expected_res = pnds.Series(["Een jongen", "Een jongen", "Een meisje", "Een meisje", "Wil niet zeggen",
                                    "Een meisje", "Een jongen", "Een meisje", "Een meisje", "Een jongen"])

        assert_that(data_processor.ritten.shape, equal_to((ROWS_TO_READ, DATA_ROW_COUNT)))
        pnds.testing.assert_series_equal(data_processor.ritten['Q2'], expected_res, check_names=False)

    def test_get_data_frame_for_list_of_columns(self, data_processor):
        selected_columns = ('Q2', 'Q4', 'Q5', 'Q6')
        test_data_frame = data_processor.get_dataframe_for(selected_columns)

        # Check the frame returned by the method under test (not the
        # unrelated combined_sheets view): all rows are kept and no column
        # outside the requested codes is included.
        assert_that(test_data_frame.shape[0], equal_to(ROWS_TO_READ))
        assert_that(set(test_data_frame.columns).issubset(selected_columns), equal_to(True))

        expected_res = pnds.Series(["Nee", "Nee", "Nee", "Nee", "Nee", "Nee", "Nee", "Nee", "Nee", "Nee"])
        pnds.testing.assert_series_equal(test_data_frame['Q5'], expected_res, check_names=False)

    def test_merge_columns(self, data_processor):
        dataframe = data_processor.get_dataframe_for(('Q25A', 'Q25', 'Q25B'))
        dataframe = data_processor.merge_two_columns(dataframe, 'Q25', 'Q25B')

        expected_res = pnds.Series(["Fiets", "Lopend", "Fiets", "Auto", "Lopend", "Auto", "Fiets", "Auto", "Lopend", "Met de taxi, NS zonetaxi, regiotaxi"])
        pnds.testing.assert_series_equal(dataframe['Q25_Q25B'], expected_res, check_names=False)

    # Exploratory value-count prints, kept disabled.
    #def test_bla(self, data_processor):
        #print(data_processor.combined_sheets.value_counts('Grootte huishouden (q60)', normalize=True))
        #print(data_processor.combined_sheets.value_counts('Geslacht (q58)', normalize=True))
        #print(data_processor.combined_sheets.value_counts('Leeftijdscategorie (q57)', normalize=True))
        #print(data_processor.ritten.value_counts('Q65', normalize=True))

        #print(data_processor.ritten.value_counts('Q56_1', normalize=True))
        #print(data_processor.combined_sheets.value_counts("Treinreisfrequentie afgelopen 12 maanden (q4)", normalize=True))
        #print(data_processor.combined_sheets.groupby(by=["Geslacht (q58)"]).sum())

0 comments on commit 323b31e

Please sign in to comment.