From e42952ef007b7e298b86c5960c3c30848bce41cb Mon Sep 17 00:00:00 2001 From: Maciej Ziaja Date: Thu, 2 Jan 2025 17:17:12 +0100 Subject: [PATCH] More cleanups and consistent colors --- it_jobs_meta/dashboard/assets/styles.css | 4 + it_jobs_meta/dashboard/dashboard.py | 36 +++- .../dashboard/dashboard_components.py | 187 +++++++++--------- it_jobs_meta/dashboard/data_provision.py | 20 ++ it_jobs_meta/dashboard/layout.py | 2 +- 5 files changed, 154 insertions(+), 95 deletions(-) diff --git a/it_jobs_meta/dashboard/assets/styles.css b/it_jobs_meta/dashboard/assets/styles.css index d15fba9..85beff9 100644 --- a/it_jobs_meta/dashboard/assets/styles.css +++ b/it_jobs_meta/dashboard/assets/styles.css @@ -11,6 +11,10 @@ p a:hover { color: rgb(33, 37, 41); } +.sticky-top { + top: 1em; +} + ._dash-loading { width: 0; height: 0; diff --git a/it_jobs_meta/dashboard/dashboard.py b/it_jobs_meta/dashboard/dashboard.py index 7639f6b..bd4b853 100644 --- a/it_jobs_meta/dashboard/dashboard.py +++ b/it_jobs_meta/dashboard/dashboard.py @@ -7,13 +7,13 @@ import dash import dash_bootstrap_components as dbc import pandas as pd -from dash import Input, Output, callback, dcc, html +from dash import Input, Output, callback from dash.development.base_component import Component as DashComponent from flask_caching import Cache as AppCache from waitress import serve as wsgi_serve from it_jobs_meta.common.utils import setup_logging -from it_jobs_meta.dashboard.dashboard_components import GraphRegistry +from it_jobs_meta.dashboard.dashboard_components import make_colormap, make_graphs from it_jobs_meta.dashboard.data_provision import MongodbDashboardDataProvider from it_jobs_meta.dashboard.layout import ( LayoutDynamicContent, @@ -35,6 +35,9 @@ def __init__( self._data_provider: MongodbDashboardDataProvider = data_provider self._layout_template_parameters: LayoutTemplateParameters = layout_template_parameters self._cache_timeout: timedelta = cache_timeout + self._technologies_cmap: dict[str, str] | None = None + self._categories_cmap: dict[str, str] | None = None + self._seniorities_cmap: dict[str, str] | None = None @property def app(self) -> dash.Dash: @@ -69,10 +72,25 @@ def make_layout(self) -> DashComponent: logging.info('Attempting to retrieve data') metadata_df = self._data_provider.fetch_metadata() data_df = self._data_provider.fetch_data(metadata_df.iloc[-1]['batch_id']) + self._technologies_cmap = make_colormap( + self._data_provider.fetch_field_values_by_count('technology') + ) + self._categories_cmap = make_colormap( + self._data_provider.fetch_field_values_by_count('category') + ) + self._seniorities_cmap = make_colormap( + self._data_provider.fetch_field_values_by_count('seniority') + ) logging.info('Data retrieval succeeded') logging.info('Making layout') - dynamic_content = self.make_dynamic_content(metadata_df, data_df) + dynamic_content = self.make_dynamic_content( + metadata_df, + data_df, + self._technologies_cmap, + self._categories_cmap, + self._seniorities_cmap, + ) layout = make_layout(self._layout_template_parameters, dynamic_content) logging.info('Making layout succeeded') logging.info('Rendering dashboard succeeded') @@ -87,7 +105,9 @@ def register_callbacks(self) -> DashComponent: def update_graphs_section(value): metadata_df = self._data_provider.fetch_metadata() data_df = self._data_provider.fetch_data(metadata_df.iloc[value]['batch_id']) - graphs = GraphRegistry.make(data_df) + graphs = make_graphs( + data_df, self._technologies_cmap, self._categories_cmap, self._seniorities_cmap + ) return make_graphs_layout(graphs) def run(self, with_wsgi=False): @@ -109,10 +129,14 @@ def run(self, with_wsgi=False): @staticmethod def make_dynamic_content( - metadata_df: pd.DataFrame, data_df: pd.DataFrame + metadata_df: pd.DataFrame, + data_df: pd.DataFrame, + technologies_cmap: dict[str, str], + categories_cmap: dict[str, str], + seniorities_cmap: dict[str, str], ) -> LayoutDynamicContent: obtained_datetime = metadata_df['obtained_datetime'] - graphs = GraphRegistry.make(data_df) + graphs = make_graphs(data_df, technologies_cmap, categories_cmap, seniorities_cmap) return LayoutDynamicContent(obtained_datetime=obtained_datetime, graphs=graphs) diff --git a/it_jobs_meta/dashboard/dashboard_components.py b/it_jobs_meta/dashboard/dashboard_components.py index c573ef8..a50b0b6 100644 --- a/it_jobs_meta/dashboard/dashboard_components.py +++ b/it_jobs_meta/dashboard/dashboard_components.py @@ -1,6 +1,5 @@ """Data dashboard components and graphs.""" -from abc import ABC, abstractmethod from enum import Enum, auto from typing import Any @@ -9,8 +8,11 @@ from dash import dcc from plotly import express as px from plotly import graph_objects as go +from plotly.colors import qualitative from sklearn import preprocessing +SENIORITIES_ORDER = ['Trainee', 'Junior', 'Mid', 'Senior', 'Expert'] + def get_n_most_frequent_vals_in_col(col: pd.Series, n: int) -> list[Any]: return col.value_counts().nlargest(n).index.to_list() @@ -23,20 +25,6 @@ def get_rows_with_n_most_frequent_vals_in_col( return df[df[col_name].isin(n_most_freq)] -def sort_by_seniority(df: pd.DataFrame) -> pd.DataFrame: - """Sorts rows according to the seniority---least to most experienced.""" - SENIORITY_ORDER = { - 'Trainee': 0, - 'Junior': 1, - 'Mid': 2, - 'Senior': 3, - 'Expert': 4, - } - - sorted = df.sort_values('seniority', key=lambda x: x.map(SENIORITY_ORDER)) - return sorted - - def move_legend_to_top(fig: go.Figure) -> go.Figure: fig.update_layout( legend={ @@ -55,6 +43,12 @@ def center_title(fig: go.Figure) -> go.Figure: return fig +def make_colormap(values: list[str]) -> dict[str, str]: + palette = qualitative.Plotly + qualitative.Pastel + qualitative.Pastel2 + extended_palette = palette * (len(values) // len(palette) + 1) + return dict(zip(values, extended_palette[: len(values)])) + + class Graph(Enum): REMOTE_PIE_CHART = auto() TECHNOLOGIES_PIE_CHART = auto() @@ -70,63 +64,64 @@ class Graph(Enum): SALARIES_MAP_SENIOR = auto() -class GraphFigure(ABC): - @classmethod - @abstractmethod - def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: - """Make the figure using the given data frame.""" - - -class GraphRegistry: - """Registry for automatic gathering and creation of graph figures.""" - - _graph_makers: dict[Graph, GraphFigure] = {} - - @classmethod - def register(cls, key: Graph): - """Add given graph implementation to the registry.""" - return lambda graph_figure: cls._register_inner(key, graph_figure) +def make_graphs( + postings_df: pd.DataFrame, + technologies_cmap: dict[str, str], + categories_cmap: dict[str, str], + seniorities_cmap: dict[str, str], +) -> dict[Graph, dcc.Graph]: + figures = { + Graph.REMOTE_PIE_CHART: RemotePieChart.make_fig(postings_df), + Graph.TECHNOLOGIES_PIE_CHART: TechnologiesPieChart.make_fig(postings_df, technologies_cmap), + Graph.CATEGORIES_PIE_CHART: CategoriesPieChart.make_fig(postings_df, categories_cmap), + Graph.SENIORITY_PIE_CHART: SeniorityPieChart.make_fig(postings_df, seniorities_cmap), + Graph.CAT_TECH_SANKEY_CHART: CategoriesTechnologiesSankeyChart.make_fig( + postings_df, technologies_cmap, categories_cmap + ), + Graph.SALARIES_MAP: SalariesMap.make_fig(postings_df), + Graph.SENIORITIES_HISTOGRAM: SenioritiesHistogram.make_fig(postings_df, seniorities_cmap), + Graph.TECHNOLOGIES_VIOLIN_PLOT: TechnologiesViolinChart.make_fig(postings_df), + Graph.CONTRACT_TYPE_VIOLIN_PLOT: ContractTypeViolinChart.make_fig(postings_df), + Graph.SALARIES_MAP_JUNIOR: SalariesMapJunior.make_fig(postings_df), + Graph.SALARIES_MAP_MID: SalariesMapMid.make_fig(postings_df), + Graph.SALARIES_MAP_SENIOR: SalariesMapSenior.make_fig(postings_df), + } - @classmethod - def make(cls, postings_df: pd.DataFrame) -> dict[Graph, dcc.Graph]: - """Make all registered graphs using the given data and get them.""" - graphs: dict[Graph, go.Figure] = {} - for graph_key in cls._graph_makers: - graphs[graph_key] = dcc.Graph(figure=cls._graph_makers[graph_key].make_fig(postings_df)) - return graphs + graphs = {graph_key: dcc.Graph(figure=figures[graph_key]) for graph_key in figures} + return graphs - @classmethod - def _register_inner(cls, key: Graph, graph_figure: GraphFigure): - cls._graph_makers[key] = graph_figure - return graph_figure - -@GraphRegistry.register(key=Graph.TECHNOLOGIES_PIE_CHART) -class TechnologiesPieChart(GraphFigure): +class TechnologiesPieChart: TITLE = 'Main technology' N_MOST_FREQ = 12 @classmethod - def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: + def make_fig(cls, postings_df: pd.DataFrame, cmap: dict[str, str] | None = None) -> go.Figure: tech_most_freq_df = get_rows_with_n_most_frequent_vals_in_col( postings_df, 'technology', cls.N_MOST_FREQ ) technology_counts = tech_most_freq_df['technology'].value_counts().reset_index() technology_counts.columns = ['technology', 'count'] - fig = px.pie(technology_counts, names='technology', values='count', title=cls.TITLE) + fig = px.pie( + technology_counts, + names='technology', + color='technology', + values='count', + title=cls.TITLE, + color_discrete_map=cmap, + ) fig.update_traces(textposition='inside') fig = center_title(fig) return fig -@GraphRegistry.register(key=Graph.CATEGORIES_PIE_CHART) -class CategoriesPieChart(GraphFigure): +class CategoriesPieChart: TITLE = 'Main category' N_MOST_FREQ = 12 @classmethod - def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: + def make_fig(cls, postings_df: pd.DataFrame, cmap: dict[str, str] | None = None) -> go.Figure: # Get the most frequent categories and their counts cat_largest_df = get_rows_with_n_most_frequent_vals_in_col( postings_df, 'category', cls.N_MOST_FREQ @@ -135,21 +130,32 @@ def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: category_counts.columns = ['category', 'count'] # Create a pie chart with count values - fig = px.pie(category_counts, names='category', values='count', title=cls.TITLE) + fig = px.pie( + category_counts, + names='category', + color='category', + values='count', + title=cls.TITLE, + color_discrete_map=cmap, + ) fig.update_traces(textposition='inside') fig = center_title(fig) return fig -@GraphRegistry.register(key=Graph.CAT_TECH_SANKEY_CHART) -class CategoriesTechnologiesSankeyChart(GraphFigure): +class CategoriesTechnologiesSankeyChart: TITLE = 'Categories and technologies share' N_MOST_FREQ_CAT = 12 N_MOST_FREQ_TECH = 12 MIN_FLOW = 12 @classmethod - def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: + def make_fig( + cls, + postings_df: pd.DataFrame, + tech_cmap: dict[str, str] | None = None, + catgr_cmap: dict[str, str] | None = None, + ) -> go.Figure: cat_most_freq = get_n_most_frequent_vals_in_col( postings_df['category'], cls.N_MOST_FREQ_CAT ) @@ -175,15 +181,17 @@ def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: sources_e = label_encoder.transform(sources) targets_e = label_encoder.transform(targets) + unique_labels = np.unique(sources + targets) + if tech_cmap is not None and catgr_cmap is not None: + colors = [catgr_cmap.get(label) or tech_cmap.get(label) for label in unique_labels] + else: + colors = None + fig = go.Figure( data=[ go.Sankey( - node={'label': np.unique(sources + targets)}, - link={ - 'source': sources_e, - 'target': targets_e, - 'value': values, - }, + node={'label': unique_labels, 'color': colors}, + link={'source': sources_e, 'target': targets_e, 'value': values}, ) ] ) @@ -192,33 +200,47 @@ def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: return fig -@GraphRegistry.register(key=Graph.SENIORITY_PIE_CHART) -class SeniorityPieChart(GraphFigure): +class SeniorityPieChart: TITLE = 'Seniority' @classmethod - def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: + def make_fig(cls, postings_df: pd.DataFrame, cmap: dict[str, str] | None = None) -> go.Figure: postings_df = postings_df.explode('seniority') seniority_counts = postings_df['seniority'].value_counts().reset_index() seniority_counts.columns = ['seniority', 'count'] - fig = px.pie(seniority_counts, values='count', names='seniority', title=cls.TITLE) + + fig = px.pie( + seniority_counts, + values='count', + color='seniority', + names='seniority', + title=cls.TITLE, + color_discrete_map=cmap, + category_orders={'seniority': SENIORITIES_ORDER}, + ) fig = center_title(fig) return fig -@GraphRegistry.register(key=Graph.SENIORITIES_HISTOGRAM) -class SenioritiesHistogram(GraphFigure): +class SenioritiesHistogram: TITLE = 'Histogram' MAX_SALARY = 40000 @classmethod - def make_fig(cls, postings_df) -> go.Figure: + def make_fig(cls, postings_df, cmap: dict[str, str] | None = None) -> go.Figure: postings_df = postings_df.explode('seniority') postings_df = postings_df[postings_df['salary_mean'] < cls.MAX_SALARY] postings_df = postings_df[postings_df['salary_mean'] > 0] - postings_df = sort_by_seniority(postings_df) - fig = px.histogram(postings_df, x='salary_mean', color='seniority', title=cls.TITLE) + fig = px.histogram( + postings_df, + x='salary_mean', + color='seniority', + nbins=50, + title=cls.TITLE, + color_discrete_map=cmap, + category_orders={'seniority': SENIORITIES_ORDER}, + ) fig = fig.update_layout( legend_title_text=None, xaxis_title_text='Mean salary (PLN)', @@ -228,8 +250,7 @@ def make_fig(cls, postings_df) -> go.Figure: return fig -@GraphRegistry.register(key=Graph.REMOTE_PIE_CHART) -class RemotePieChart(GraphFigure): +class RemotePieChart: TITLE = 'Fully remote work possible' @classmethod @@ -241,8 +262,7 @@ def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure: return fig -@GraphRegistry.register(key=Graph.SALARIES_MAP) -class SalariesMap(GraphFigure): +class SalariesMap: TITLE = 'Mean salary by location (PLN)' N_MOST_FREQ = 15 POLAND_LAT, POLAND_LON = 52.0, 19.0 @@ -305,8 +325,7 @@ def make_fig( return fig -@GraphRegistry.register(key=Graph.SALARIES_MAP_JUNIOR) -class SalariesMapJunior(GraphFigure): +class SalariesMapJunior: TITLE = 'Mean salary for Juniors' @classmethod @@ -321,8 +340,7 @@ def make_fig( return fig -@GraphRegistry.register(key=Graph.SALARIES_MAP_MID) -class SalariesMapMid(GraphFigure): +class SalariesMapMid: TITLE = 'Mean salary for Mids' @classmethod @@ -337,8 +355,7 @@ def make_fig( return fig -@GraphRegistry.register(key=Graph.SALARIES_MAP_SENIOR) -class SalariesMapSenior(GraphFigure): +class SalariesMapSenior: TITLE = 'Mean salary for Seniors' @classmethod @@ -350,24 +367,19 @@ def make_fig(cls, postings_df) -> go.Figure: return fig -@GraphRegistry.register(key=Graph.TECHNOLOGIES_VIOLIN_PLOT) -class TechnologiesViolinChart(GraphFigure): +class TechnologiesViolinChart: TITLE = 'Violin plot split by seniority' MAX_SALARY = 35000 N_MOST_FREQ_TECH = 8 @classmethod - def make_fig( - cls, - postings_df, - ) -> go.Figure: + def make_fig(cls, postings_df, seniorities_cmap: dict[str, str] | None = None) -> go.Figure: postings_df = postings_df.explode('seniority') tech_most_freq = get_rows_with_n_most_frequent_vals_in_col( postings_df, 'technology', cls.N_MOST_FREQ_TECH ) limited = tech_most_freq[tech_most_freq['salary_mean'] < cls.MAX_SALARY] limited = limited[limited['seniority'].isin(('Junior', 'Mid', 'Senior'))] - limited = sort_by_seniority(limited) # Plotly has problems with creating violin plots if there are too few # samples, we filter out seniority and technology paris for which # there aren't enough data points to make a nice curve @@ -396,8 +408,7 @@ def make_fig( return fig -@GraphRegistry.register(key=Graph.CONTRACT_TYPE_VIOLIN_PLOT) -class ContractTypeViolinChart(GraphFigure): +class ContractTypeViolinChart: TITLE = 'Violin plot split by contract' MAX_SALARY = 40000 N_MOST_FREQ_TECH = 8 diff --git a/it_jobs_meta/dashboard/data_provision.py b/it_jobs_meta/dashboard/data_provision.py index 8897a8d..031d851 100644 --- a/it_jobs_meta/dashboard/data_provision.py +++ b/it_jobs_meta/dashboard/data_provision.py @@ -60,3 +60,23 @@ def fetch_data(self, batch_id: str | None = None) -> pd.DataFrame: if len(df) == 0: raise ValueError('Found no data, dashboard cannot be made') return df + + def fetch_field_values_by_count(self, field: str) -> list[str]: + client: MongoClient + with MongoClient( + self.host, self.port, username=self.user_name, password=self.password + ) as client: + db: Database = client[self.db_name] + collection = db['postings'] + + return [ + doc["_id"] + for doc in collection.aggregate( + [ + {"$match": {field: {"$ne": None}}}, + {"$unwind": f"${field}"}, + {"$group": {"_id": f"${field}", "count": {"$sum": 1}}}, + {"$sort": {"count": -1}}, + ] + ) + ] diff --git a/it_jobs_meta/dashboard/layout.py b/it_jobs_meta/dashboard/layout.py index c1c7270..6aa5f42 100644 --- a/it_jobs_meta/dashboard/layout.py +++ b/it_jobs_meta/dashboard/layout.py @@ -123,7 +123,7 @@ def make_timeline_slider(obtained_datetime) -> DashComponent: slider_marks = dict(enumerate([t.strftime('%b %Y') for t in obtained_datetime])) return dcc.Slider( id='batch-slider', - className='text-nowrap mx-3 my-5', + className='text-nowrap sticky-top bg-white mx-4 my-3 p-5 rounded-pill shadow', step=None, marks=slider_marks, value=len(slider_marks) - 1,