Skip to content

Commit

Permalink
Merge pull request #52 from maciejzj/develop
Browse files Browse the repository at this point in the history
Release changes v0.1.1 to master

Changelog included in the release git tag.
  • Loading branch information
maciejzj authored Feb 5, 2023
2 parents 2daf4ba + c53c4e0 commit 922af0c
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 12 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ Resort to the command line help to discover available options:

```
$ it-jobs-meta -h
usage: it-jobs-meta [-h] [-l LOG_PATH] {pipeline,dashboard} ...
usage: it-jobs-meta [-h] [-v {debug,info,warning,critical,error}]
[-l LOG_PATH]
{pipeline,dashboard} ...
Data pipeline and meta-analysis dashboard for IT job postings
Expand All @@ -83,6 +85,8 @@ positional arguments:
options:
-h, --help show this help message and exit
-v {debug,info,warning,critical,error}, --log-level {debug,info,warning,critical,error}
set verbosity/log level of the program (default: info)
-l LOG_PATH, --log-path LOG_PATH
path to the log file (default: var/it_jobs_meta.log)
```
Expand Down
34 changes: 23 additions & 11 deletions it_jobs_meta/data_pipeline/data_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,11 @@ class EtlTransformationEngine(Generic[ProcessDataType], ABC):
'referralBonusCurrency',
]

# Look at the ETL pipeline implementation to see the prefined order of the
# string formatting operations in columns.
COLS_TO_LOWER = {
# Look at the ETL pipeline implementation to see the predefined order of
# the string formatting operations in columns.
COLS_TO_LOWER = [
'technology',
'category',
}
]

# In any column replace these strings
VALS_TO_REPLACE = {
Expand All @@ -75,6 +74,11 @@ class EtlTransformationEngine(Generic[ProcessDataType], ABC):
'react': 'javascript',
}

# Apply transformation like 'businessAnalyst' -> 'business Analyst'
COLS_TO_SPLIT_ON_CAPITAL_LETTERS = [
'category',
]

# Title case text is like "Sample Text".
COLS_TO_TITLE_CASE = ['category']

Expand Down Expand Up @@ -112,6 +116,10 @@ def unify_to_lower(self, data: ProcessDataType) -> ProcessDataType:
def replace_values(self, data: ProcessDataType) -> ProcessDataType:
    """Replace values according to the VALS_TO_REPLACE mapping."""

@abstractmethod
def split_on_capitals(self, data: ProcessDataType) -> ProcessDataType:
    """Split text in COLS_TO_SPLIT_ON_CAPITAL_LETTERS on capital letters.

    A space is put in front of a capital letter that starts a new word,
    e.g. 'businessAnalyst' -> 'business Analyst'.
    """

@abstractmethod
def to_title_case(self, data: ProcessDataType) -> ProcessDataType:
"""Transform columns in COLS_TO_TITLE_CASE to title case.
Expand Down Expand Up @@ -206,6 +214,7 @@ def transform(self, data: ProcessDataType) -> ProcessDataType:
data = self._transformation_engine.extract_salaries(data)
data = self._transformation_engine.unify_to_lower(data)
data = self._transformation_engine.replace_values(data)
data = self._transformation_engine.split_on_capitals(data)
data = self._transformation_engine.to_title_case(data)
data = self._transformation_engine.to_capitalized(data)
data = self._transformation_engine.unify_missing_values(data)
Expand Down Expand Up @@ -270,19 +279,22 @@ def drop_duplicates(self, data: pd.DataFrame) -> pd.DataFrame:

def unify_to_lower(self, data: pd.DataFrame) -> pd.DataFrame:
    """Lower-case the text in every column listed in COLS_TO_LOWER.

    The columns are reassigned on the passed frame, which is also returned.
    """
    for col in EtlTransformationEngine.COLS_TO_LOWER:
        # The vectorised `.str` accessor passes missing values through
        # unchanged, so a separate not-null filtering pass is unnecessary.
        data[col] = data[col].str.lower()
    return data

def replace_values(self, data: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with values swapped as mapped in VALS_TO_REPLACE."""
    substitutions = EtlTransformationEngine.VALS_TO_REPLACE
    return data.replace(to_replace=substitutions)

def split_on_capitals(self, data: pd.DataFrame) -> pd.DataFrame:
    """Split camel-case text in COLS_TO_SPLIT_ON_CAPITAL_LETTERS columns.

    A space is inserted wherever a capital letter directly follows a
    non-capital word character, e.g. 'businessAnalyst' ->
    'business Analyst'. Runs of capitals (acronyms) are kept together.

    The columns are reassigned on the passed frame, which is also returned.
    """
    # The left-hand character must be a word character that is NOT an ASCII
    # capital. Allowing any `\w` there would tear acronyms apart
    # ('ITSupport' -> 'I TSupport', 'QA' -> 'Q A').
    word_boundary = r'([^\WA-Z])([A-Z])'
    for col in EtlTransformationEngine.COLS_TO_SPLIT_ON_CAPITAL_LETTERS:
        data[col] = data[col].str.replace(
            word_boundary, r'\1 \2', regex=True
        )
    return data

def to_title_case(self, data: pd.DataFrame) -> pd.DataFrame:
    """Convert the text in every COLS_TO_TITLE_CASE column to title case.

    Title case text is like "Sample Text". Splitting words on capital
    letters is the job of `split_on_capitals`; repeating it here would put
    a second space in front of every capital letter.

    The columns are reassigned on the passed frame, which is also returned.
    """
    for col in EtlTransformationEngine.COLS_TO_TITLE_CASE:
        # `.str.title()` leaves missing values as they are.
        data[col] = data[col].str.title()
    return data

def to_capitalized(self, data: pd.DataFrame) -> pd.DataFrame:
Expand Down

0 comments on commit 922af0c

Please sign in to comment.