Skip to content

Commit

Permalink
Docstrings and small fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
maciejzj committed Mar 26, 2022
1 parent 3071c38 commit 0dd5e6b
Show file tree
Hide file tree
Showing 14 changed files with 228 additions and 79 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ end
```

Notice that the terms *data lake* and *data warehouse* are used in a rather
loose way in the following descriptions.
loose/naive way in the following descriptions.

## Setup

Expand Down
2 changes: 2 additions & 0 deletions it_jobs_meta/__main__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Run the data pipeline or dashboard using CLI options."""

from it_jobs_meta.common.cli import CliArgumentParser
from it_jobs_meta.common.utils import setup_logging
from it_jobs_meta.dashboard.dashboard import (
Expand Down
19 changes: 19 additions & 0 deletions it_jobs_meta/common/cli.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Command line parser for the it-jobs-meta application."""

import argparse
from pathlib import Path
from typing import Any
Expand All @@ -8,6 +10,8 @@


class CliArgumentParser:
"""Command line parser for the it-jobs-meta application."""

PROG = 'it-jobs-meta'
DESCRIPTION = (
'Data pipeline and meta-analysis dashboard for IT job postings'
Expand Down Expand Up @@ -39,6 +43,11 @@ def args(self) -> dict[str, Any]:
return self._args

def extract_data_lake(self) -> tuple[DataLakeImpl, Path]:
"""Extract data lake setup from the arguments.
:return: Tuple with the selected data lake implementation type and
the config path.
"""
match self.args:
case {'redis': Path(), 's3_bucket': None}:
return DataLakeImpl.REDIS, self.args['redis']
Expand All @@ -51,6 +60,11 @@ def extract_data_lake(self) -> tuple[DataLakeImpl, Path]:
)

def extract_etl_loader(self) -> tuple[EtlLoaderImpl, Path]:
"""Get the ETL loader setup from the arguments.
:return: Tuple with the selected ETL loader implementation type and
the config path.
"""
match self.args:
case {'mongodb': Path(), 'sql': None}:
return EtlLoaderImpl.MONGODB, self.args['mongodb']
Expand All @@ -63,6 +77,11 @@ def extract_etl_loader(self) -> tuple[EtlLoaderImpl, Path]:
)

def extract_data_provider(self) -> tuple[DashboardProviderImpl, Path]:
"""Get the dashboard data provider setup from the arguments.
:return: Tuple with the selected data provider implementation type and
the config path.
"""
match self.args:
case {'mongodb': Path()}:
return DashboardProviderImpl.MONGODB, self.args['mongodb']
Expand Down
6 changes: 6 additions & 0 deletions it_jobs_meta/common/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Utility tools shared across the application."""

import logging
import sys
from pathlib import Path
Expand All @@ -7,6 +9,10 @@


def setup_logging(*args: Path):
"""Enable logging to stdout and the given files.
:param *args: Paths to log output files.
"""
log_file_handlers = []
for log_path in args:
log_path.parent.mkdir(exist_ok=True, parents=True)
Expand Down
18 changes: 10 additions & 8 deletions it_jobs_meta/dashboard/dashboard.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Dashboard server for job postings data visualization."""

import logging
from datetime import timedelta
from pathlib import Path
Expand All @@ -14,7 +16,6 @@
from it_jobs_meta.dashboard.data_provision import (
DashboardDataProviderFactory,
DashboardProviderImpl,
GatheredData,
)
from it_jobs_meta.dashboard.layout import DynamicContent, make_layout

Expand Down Expand Up @@ -66,11 +67,11 @@ def cache(self) -> AppCache:
def render_layout(self) -> DashComponent:
logging.info('Rendering dashboard')
logging.info('Attempting to retrieve data')
data = self._data_provider_factory.make().gather_data()
metadata_df, data_df = self._data_provider_factory.make().gather_data()
logging.info('Data retrieval succeeded')

logging.info('Making layout')
dynamic_content = self.make_dynamic_content(data)
dynamic_content = self.make_dynamic_content(metadata_df, data_df)
layout = make_layout(dynamic_content)
logging.info('Making layout succeeded')
logging.info('Rendering dashboard succeeded')
Expand Down Expand Up @@ -98,17 +99,18 @@ def run(self, with_wsgi=False):
raise

@staticmethod
def make_dynamic_content(data: GatheredData) -> DynamicContent:
obtained_datetime = pd.to_datetime(
data.metadata['obtained_datetime'][0]
)
graphs = GraphRegistry.make(data.postings)
def make_dynamic_content(
metadata_df: pd.DataFrame, data_df: pd.DataFrame
) -> DynamicContent:
obtained_datetime = pd.to_datetime(metadata_df['obtained_datetime'][0])
graphs = GraphRegistry.make(data_df)
return DynamicContent(
obtained_datetime=obtained_datetime, graphs=graphs
)


def main():
"""Run the demo dashboard with a short cache timeout (for development)."""
setup_logging()
data_warehouse_config_path = Path('config/mongodb_config.yml')
data_provider_factory = DashboardDataProviderFactory(
Expand Down
17 changes: 15 additions & 2 deletions it_jobs_meta/dashboard/dashboard_components.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Data dashboard components and graphs."""

from abc import ABC, abstractmethod
from enum import Enum, auto
from typing import Any
Expand All @@ -22,7 +24,14 @@ def get_rows_with_n_most_frequent_vals_in_col(


def sort_by_seniority(df: pd.DataFrame) -> pd.DataFrame:
SENIORITY_ORDER = {'Trainee': 0, 'Junior': 1, 'Mid': 2, 'Senior': 3}
"""Sorts rows according to the seniority---least to most experienced."""
SENIORITY_ORDER = {
'Trainee': 0,
'Junior': 1,
'Mid': 2,
'Senior': 3,
'Expert': 4,
}

sorted = df.sort_values('seniority', key=lambda x: x.map(SENIORITY_ORDER))
return sorted
Expand Down Expand Up @@ -65,18 +74,22 @@ class GraphFigure(ABC):
@classmethod
@abstractmethod
def make_fig(cls, postings_df: pd.DataFrame) -> go.Figure:
pass
"""Make the figure using the given data frame."""


class GraphRegistry:
"""Registry for automatic gathering and creation of graph figures."""

_graph_makers: dict[Graph, GraphFigure] = {}

@classmethod
def register(cls, key: Graph):
"""Add given graph implementation to the registry."""
return lambda graph_figure: cls._register_inner(key, graph_figure)

@classmethod
def make(cls, postings_df: pd.DataFrame) -> dict[Graph, dcc.Graph]:
"""Make all registered graphs using the given data and get them."""
graphs: dict[Graph, go.Figure] = {}
for graph_key in cls._graph_makers:
graphs[graph_key] = dcc.Graph(
Expand Down
26 changes: 15 additions & 11 deletions it_jobs_meta/dashboard/data_provision.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Data provision and data source for the data dashboard."""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path

Expand All @@ -9,16 +10,14 @@
from it_jobs_meta.common.utils import load_yaml_as_dict


@dataclass
class GatheredData:
metadata: pd.DataFrame
postings: pd.DataFrame


class DashboardDataProvider(ABC):
@abstractmethod
def gather_data(self) -> GatheredData:
pass
def gather_data(self) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Gather data for the dashboard.
:return: Tuple with metadata and data dataframes as (metadata_df,
data_df)
"""


class MongodbDashboardDataProvider(DashboardDataProvider):
Expand All @@ -41,14 +40,19 @@ def from_config_file(
) -> 'MongodbDashboardDataProvider':
return cls(**load_yaml_as_dict(config_file_path))

def gather_data(self) -> GatheredData:
def gather_data(self) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Gather data for the dashboard.
:return: Tuple with metadata and data dataframes as (metadata_df,
data_df)
"""
metadata_df = pd.json_normalize(self._db['metadata'].find())
postings_df = pd.json_normalize(self._db['postings'].find())
if metadata_df.empty or postings_df.empty:
raise RuntimeError(
'Data gather for the dashboard resulted in empty datasets'
)
return GatheredData(metadata=metadata_df, postings=postings_df)
return metadata_df, postings_df


class DashboardProviderImpl(Enum):
Expand Down
2 changes: 2 additions & 0 deletions it_jobs_meta/dashboard/layout.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Dashboard layout and components stitching."""

from dataclasses import dataclass
from datetime import datetime

Expand Down
Loading

0 comments on commit 0dd5e6b

Please sign in to comment.