# --- 150_group_sold_products_by_date.py ---
# Group the activities DF by 'sell_date'. agg() is used because we want to apply several
# aggregations to the same grouping: the number of unique products sold, and the names of
# those unique products joined into one sorted, comma-separated string.
# Note: agg() returns a DataFrame directly, not a Series like transform() does.

import pandas as pd


def _join_sorted_unique(values: pd.Series) -> str:
    """Return the distinct non-null entries of *values*, sorted and comma-joined."""
    # Nulls are dropped so this column agrees with 'nunique' (which ignores
    # nulls) and so sorted()/join() never see a non-string placeholder.
    return ','.join(sorted(set(values.dropna())))


def categorize_products(activities: pd.DataFrame) -> pd.DataFrame:
    """Summarize the distinct products sold on each sell date.

    Args:
        activities: Frame with at least 'sell_date' and 'product' columns.

    Returns:
        One row per 'sell_date' with columns:
        'sell_date', 'num_sold' (count of distinct non-null products) and
        'products' (those product names, sorted and joined with ',').
    """
    per_date = activities.groupby(['sell_date']).agg(
        num_sold=('product', 'nunique'),
        products=('product', _join_sorted_unique),
    )
    # Move 'sell_date' back from the index into a regular column.
    return per_date.reset_index()


# --- 151_daily_leads_and_partners.py ---
# Group the daily_sales DF by both 'date_id' and 'make_name' and return the number of unique
# lead IDs and unique partner IDs associated with each pair.


def daily_leads_and_partners(daily_sales: pd.DataFrame) -> pd.DataFrame:
    """Count distinct leads and partners per ('date_id', 'make_name') pair.

    Args:
        daily_sales: Frame with 'date_id', 'make_name', 'lead_id' and
            'partner_id' columns.

    Returns:
        One row per ('date_id', 'make_name') pair with columns:
        'date_id', 'make_name', 'unique_leads' and 'unique_partners'.
    """
    per_day_and_make = daily_sales.groupby(['date_id', 'make_name']).agg(
        unique_leads=('lead_id', 'nunique'),
        unique_partners=('partner_id', 'nunique'),
    )
    # Move the two grouping keys back from the index into regular columns.
    return per_day_and_make.reset_index()


# --- 152_actors_and_directors_cooperated.py ---
# Group the DF based on actor and director ID pairs and return the number of times they cooperated. Return
# the DF where the count is greater than or equal to 3.
import pandas as pd


def actors_and_directors(actor_director: pd.DataFrame) -> pd.DataFrame:
    """Return the (actor, director) pairs that cooperated at least three times.

    Args:
        actor_director: Frame with 'actor_id' and 'director_id' columns,
            one row per cooperation.

    Returns:
        Frame with columns 'actor_id' and 'director_id' holding every pair
        that appears in three or more rows.
    """
    # size() counts rows per pair; counting the 'timestamp' column instead
    # would skip rows whose timestamp is null and undercount cooperations.
    pair_counts = (
        actor_director
        .groupby(['actor_id', 'director_id'])
        .size()
        .reset_index(name='count')
    )
    frequent = pair_counts['count'] >= 3
    return pair_counts[frequent][['actor_id', 'director_id']]