Skip to content

Commit

Permalink
update: add junda data
Browse files Browse the repository at this point in the history
  • Loading branch information
terryyz authored May 8, 2024
2 parents 29c877a + 066b0ce commit b379271
Show file tree
Hide file tree
Showing 79 changed files with 8,961 additions and 1 deletion.
167 changes: 167 additions & 0 deletions data/raw/f_1706_junda_james.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import pandas as pd
import numpy as np
import itertools
from datetime import datetime, timedelta
import seaborn as sns

def f_1706(df, fruits=None, days=None, seed=None, sales_lower_bound=1, sales_upper_bound=50):
    """
    Fill an empty DataFrame with random sales figures for every (fruit, day) pair
    and draw a seaborn boxplot of sales grouped by fruit.

    Parameters:
    - df (pd.DataFrame): Initial DataFrame to append sales data to. Must be empty.
    - fruits (List[str], optional): Fruits to generate sales for. Defaults to
      ['Apple', 'Banana', 'Cherry', 'Date', 'Elderberry'].
    - days (List[datetime], optional): Days to generate sales for. Defaults to the
      seven days from January 1, 2024, to January 7, 2024.
    - seed (int, optional): Seed for NumPy's global random number generator.
      When None, the generator is left unseeded. Defaults to None.
    - sales_lower_bound (int, optional): Lowest sales value that can be drawn (inclusive). Defaults to 1.
    - sales_upper_bound (int, optional): Upper limit for sales values; passed to
      np.random.randint as the exclusive `high` bound. Defaults to 50.

    Returns:
    Tuple[pd.DataFrame, object]: The DataFrame with 'Fruit', 'Day' and 'Sales' columns,
    and the object returned by sns.boxplot for those sales.

    Raises:
    TypeError: If 'df' is not a pandas DataFrame.
    ValueError: If 'df' is not empty or if 'sales_lower_bound' is not less than 'sales_upper_bound'.

    Requirements:
    - pandas
    - numpy
    - itertools
    - datetime
    - seaborn

    Example:
    >>> initial_df = pd.DataFrame()
    >>> report_df, plot = f_1706(initial_df, seed=42)
    >>> print(report_df.head())
       Fruit        Day  Sales
    0  Apple 2024-01-01     39
    1  Apple 2024-01-02     29
    2  Apple 2024-01-03     15
    3  Apple 2024-01-04     43
    4  Apple 2024-01-05      8
    >>> plot.figure.show()
    """
    # Guard clauses: reject bad input before any random state is touched.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")
    if not df.empty:
        raise ValueError("Input DataFrame must be empty")
    if sales_lower_bound >= sales_upper_bound:
        raise ValueError("sales_lower_bound must be less than sales_upper_bound")

    fruit_names = ['Apple', 'Banana', 'Cherry', 'Date', 'Elderberry'] if fruits is None else fruits

    if days is None:
        # Default window: January 1, 2024 through January 7, 2024.
        first_day = datetime(2024, 1, 1)
        day_list = [first_day + timedelta(days=offset) for offset in range(7)]
    else:
        day_list = days

    if seed is not None:
        np.random.seed(seed)

    # One row per (fruit, day) combination, fruits varying slowest.
    pairs = list(itertools.product(fruit_names, day_list))
    sales_data = pd.DataFrame(pairs, columns=['Fruit', 'Day'])
    sales_data['Sales'] = np.random.randint(sales_lower_bound, sales_upper_bound, size=len(pairs))

    combined = pd.concat([df, sales_data])
    boxplot = sns.boxplot(x='Fruit', y='Sales', data=combined)

    return combined, boxplot

import unittest
import pandas as pd
import numpy as np
from datetime import datetime

class TestCases(unittest.TestCase):
    """Unit tests for f_1706: input validation, defaults, sales bounds and seeding."""

    def setUp(self):
        # Default date range the function falls back to when `days` is omitted:
        # January 1, 2024 through January 7, 2024.
        self.default_days = [datetime(2024, 1, day) for day in range(1, 8)]

    def test_default_days_range(self):
        """Test the default days range is correctly applied."""
        initial_df = pd.DataFrame()
        report_df, _ = f_1706(initial_df, seed=42)
        unique_days = sorted(report_df['Day'].dt.date.unique())
        expected_days = [day.date() for day in self.default_days]
        self.assertEqual(len(unique_days), len(expected_days), "The number of unique days should match the default range.")
        for day in unique_days:
            self.assertIn(day, expected_days, "Each unique day should be within the default range.")

    def test_custom_days_range(self):
        """Test functionality with a custom days range."""
        initial_df = pd.DataFrame()
        custom_days = [datetime(2024, 1, 10), datetime(2024, 1, 11)]
        report_df, _ = f_1706(initial_df, days=custom_days, seed=42)
        unique_days = sorted(report_df['Day'].dt.date.unique())
        expected_custom_days = [day.date() for day in custom_days]
        self.assertEqual(len(unique_days), len(expected_custom_days), "The number of unique days should match the custom range.")
        for day in unique_days:
            self.assertIn(day, expected_custom_days, "Each unique day should be within the custom range.")

    def test_sales_bounds(self):
        """Test custom sales bounds are respected (upper bound is exclusive)."""
        initial_df = pd.DataFrame()
        report_df, _ = f_1706(initial_df, seed=42, sales_lower_bound=20, sales_upper_bound=30)
        sales_values = report_df['Sales'].unique()
        self.assertTrue(all(20 <= val < 30 for val in sales_values), "All sales values should be within the specified bounds.")

    def test_invalid_sales_bounds(self):
        """Test error handling for invalid sales bounds."""
        with self.assertRaises(ValueError):
            f_1706(pd.DataFrame(), sales_lower_bound=50, sales_upper_bound=10)

    def test_with_non_dataframe_input(self):
        """Test that providing a non-DataFrame input raises a TypeError."""
        with self.assertRaises(TypeError):
            f_1706("not_a_dataframe")

    def test_reproducibility_with_seed(self):
        """Test reproducibility of sales data generation with a fixed seed."""
        initial_df = pd.DataFrame()
        df1, _ = f_1706(initial_df, seed=42)
        df2, _ = f_1706(initial_df, seed=42)
        # DataFrames generated with the same seed should be identical.
        # NOTE: assert_frame_equal takes no failure-message argument; its third
        # positional parameter is `check_dtype`, so no message string is passed.
        pd.testing.assert_frame_equal(df1, df2)

    def test_with_custom_fruits_and_days(self):
        """Test custom fruits and days, including the exact seeded sales values."""
        fruits = ['Mango', 'Pineapple']
        days = [pd.Timestamp('2023-01-01'), pd.Timestamp('2023-01-02')]
        initial_df = pd.DataFrame()
        report_df, plot = f_1706(initial_df, fruits=fruits, days=days, sales_lower_bound=1, sales_upper_bound=50, seed=42)

        self.assertEqual(len(report_df['Fruit'].unique()), len(fruits), "Number of unique fruits should match the input")
        self.assertEqual(len(report_df['Day'].unique()), len(days), "Number of unique days should match the input")
        self.assertTrue(hasattr(plot, 'figure'), "Plot object should have a 'figure' attribute")

        # Convert DataFrame to a list of strings for each row
        df_list = report_df.apply(lambda row: ','.join(row.values.astype(str)), axis=1).tolist()

        # Check if the converted list matches the expected output.
        # assertEqual (not assertAlmostEqual) is required here: the operands are
        # lists of strings, and assertAlmostEqual's third positional argument is
        # `places`, not the failure message.
        expect_output = ['Mango,2023-01-01 00:00:00,39', 'Mango,2023-01-02 00:00:00,29', 'Pineapple,2023-01-01 00:00:00,15', 'Pineapple,2023-01-02 00:00:00,43']
        self.assertEqual(df_list, expect_output, "DataFrame contents should match the expected output")

    def test_error_on_non_empty_dataframe(self):
        """Test that a ValueError is raised if the input DataFrame is not empty."""
        # Create a non-empty DataFrame
        non_empty_df = pd.DataFrame({'A': [1, 2, 3]})

        # Attempt to call f_1706 with a non-empty DataFrame and check for ValueError
        with self.assertRaises(ValueError) as context:
            f_1706(non_empty_df, seed=42)

        # Check the error message to ensure it's for the non-empty DataFrame condition
        self.assertIn("Input DataFrame must be empty", str(context.exception), "Function should raise ValueError for non-empty DataFrame input.")

def run_tests():
    """Load every test from TestCases and run it with a text runner."""
    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
    unittest.TextTestRunner().run(suite)

if __name__ == "__main__":
    import doctest

    # Docstring examples are checked first; the unittest suite follows.
    doctest.testmod()
    run_tests()
145 changes: 145 additions & 0 deletions data/raw/f_1718_junda_james.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import numpy as np
import pandas as pd

def f_1718(products, n_samples=100, sales_lower=50, sales_upper=200, profit_margin_min=0.1, profit_margin_max=0.5, random_seed=42):
    """
    Generate a sales report with randomly simulated sales and profit data for a given list of products.
    The data is aggregated by product and sorted by total profit in descending order.

    Parameters:
    - products (list of str): List of product names. An empty list yields an empty report.
    - n_samples (int): The number of data points to generate for the report. Default is 100.
    - sales_lower (int): The minimum sales value for the random generation (inclusive). Default is 50.
    - sales_upper (int): The maximum sales value for the random generation (inclusive). Default is 200.
    - profit_margin_min (float): The minimum profit margin as a fraction of sales. Default is 0.1.
    - profit_margin_max (float): The maximum profit margin as a fraction of sales. Default is 0.5.
    - random_seed (int): Seed for NumPy's global random number generator to ensure reproducibility. Default is 42.

    Returns:
    pd.DataFrame: A DataFrame with columns 'Product', 'Sales' and 'Profit' holding the summed
    sales and profit for each product that was sampled, sorted by profit in descending order.

    Raises:
    TypeError: If products is not a list of strings (including None or any other non-list value).
    ValueError: If n_samples is not a positive integer; if sales_lower or sales_upper is not an
    integer or sales_lower is not less than sales_upper; or if profit_margin_min or
    profit_margin_max is not numeric or profit_margin_min is not less than profit_margin_max.

    Requirements:
    - numpy
    - pandas

    Example:
    >>> products = ["iPhone", "iPad", "Macbook", "Airpods", "Apple Watch"]
    >>> report = f_1718(products, n_samples=50, sales_lower=100, sales_upper=150, profit_margin_min=0.2, profit_margin_max=0.4, random_seed=42)
    >>> print(report)
           Product  Sales      Profit
    2      Macbook   1561  444.826709
    3         iPad   1383  401.925334
    0      Airpods   1297  381.482713
    1  Apple Watch   1123  308.078536
    4       iPhone    921  294.013887
    """
    np.random.seed(random_seed)

    # Check the type before the emptiness shortcut: otherwise falsy non-lists
    # (None, "", (), 0) would slip through and silently produce an empty report.
    if not isinstance(products, list) or not all(isinstance(product, str) for product in products):
        raise TypeError("products must be a list of strings.")

    # Nothing to sample from: return an empty report with the expected columns.
    if not products:
        return pd.DataFrame(columns=["Product", "Sales", "Profit"])

    if not isinstance(n_samples, int) or n_samples <= 0:
        raise ValueError("n_samples must be a positive integer.")
    if not (isinstance(sales_lower, int) and isinstance(sales_upper, int)) or sales_lower >= sales_upper:
        raise ValueError("sales_lower must be less than sales_upper and both must be integers.")
    if not all(isinstance(x, (int, float)) for x in [profit_margin_min, profit_margin_max]) or profit_margin_min >= profit_margin_max:
        raise ValueError("profit_margin_min must be less than profit_margin_max and both must be numeric.")

    # The draws are deliberately made one sample at a time, in this order
    # (product, sales, margin): vectorising would change the sequence of random
    # numbers consumed and therefore the seeded output.
    data = []
    for _ in range(n_samples):
        product = np.random.choice(products)
        # randint's upper bound is exclusive, so +1 makes sales_upper attainable.
        sales = np.random.randint(sales_lower, sales_upper + 1)
        profit = sales * np.random.uniform(profit_margin_min, profit_margin_max)
        data.append([product, sales, profit])

    df = pd.DataFrame(data, columns=["Product", "Sales", "Profit"])
    df = df.groupby("Product", as_index=False).sum()
    df.sort_values("Profit", ascending=False, inplace=True)

    return df

import pandas as pd
import unittest

class TestCases(unittest.TestCase):
    """Unit tests for f_1718: reproducibility, aggregation, sorting and validation."""

    def test_random_reproducibility(self):
        """Test that two calls with the same seed produce identical reports."""
        report1 = f_1718(["iPhone", "iPad"], n_samples=50, sales_lower=50, sales_upper=200, profit_margin_min=0.1, profit_margin_max=0.5, random_seed=42)
        report2 = f_1718(["iPhone", "iPad"], n_samples=50, sales_lower=50, sales_upper=200, profit_margin_min=0.1, profit_margin_max=0.5, random_seed=42)
        pd.testing.assert_frame_equal(report1, report2)

    def test_number_of_rows(self):
        """Test that the report has one aggregated row per distinct product."""
        report = f_1718(["iPhone", "iPad"], n_samples=50, sales_lower=50, sales_upper=200)
        self.assertEqual(len(report), len({"iPhone", "iPad"}))

    def test_sorting_by_profit(self):
        """Test that rows are ordered by profit, highest first."""
        report = f_1718(["iPhone", "iPad"], sales_lower=50, sales_upper=200)
        self.assertTrue(report["Profit"].is_monotonic_decreasing)

    def test_custom_parameters(self):
        """Test that custom parameters yield a non-empty report with the expected columns."""
        report = f_1718(["iPhone", "iPad", "Macbook", "Airpods", "Apple Watch"], n_samples=50, sales_lower=100, sales_upper=150, profit_margin_min=0.2, profit_margin_max=0.4, random_seed=42)
        # Structural checks only; the exact seeded values are pinned in
        # test_new_custom_parameters.
        self.assertTrue(len(report) > 0, "The report should contain aggregated sales and profit data.")
        self.assertEqual(list(report.columns), ["Product", "Sales", "Profit"])

    def test_new_custom_parameters(self):
        """Test the exact report contents for a fixed seed and custom parameters."""
        report1 = f_1718(["iPhone", "iPad", "Macbook", "Airpods", "Apple Watch"], n_samples=50, sales_lower=100, sales_upper=150, profit_margin_min=0.2, profit_margin_max=0.4, random_seed=42)
        df_list = report1.apply(lambda row: ','.join(row.values.astype(str)), axis=1).tolist()
        expect = ['Macbook,1561,444.82670855378143', 'iPad,1383,401.9253335536443', 'Airpods,1297,381.4827132170069', 'Apple Watch,1123,308.07853599252707', 'iPhone,921,294.0138866107959']

        self.assertEqual(df_list, expect, "DataFrame contents should match the expected output")

    def test_sales_bounds_validation(self):
        """Test that an error is raised if sales_lower is greater than sales_upper."""
        with self.assertRaises(ValueError):
            f_1718(["Product1"], sales_lower=250, sales_upper=100)

    def test_profit_margin_validation(self):
        """Test that an error is raised if profit_margin_min is greater than or equal to profit_margin_max."""
        with self.assertRaises(ValueError):
            f_1718(["Product1"], profit_margin_min=0.6, profit_margin_max=0.5)

    def test_product_list_validation(self):
        """Test that an error is raised if the products list is not a list of strings."""
        with self.assertRaises(TypeError):
            f_1718([123, 456], n_samples=10)

    def test_n_samples_validation(self):
        """Test that an error is raised if n_samples is negative."""
        with self.assertRaises(ValueError):
            f_1718(["Product1"], n_samples=-10)

    def test_empty_product_list(self):
        """Test that the function can handle an empty product list."""
        report = f_1718([], n_samples=10)
        self.assertTrue(report.empty, "The report should be empty if no products are provided.")

    def test_zero_samples(self):
        """Test that zero samples is rejected just like a negative count."""
        # Uses 0 (not a negative number) so the boundary of "positive integer"
        # is covered rather than duplicating test_n_samples_validation.
        with self.assertRaises(ValueError):
            f_1718(["Product1"], n_samples=0)

    def test_single_product_reproducibility(self):
        """Test that the function generates consistent results for a single product across multiple runs."""
        report1 = f_1718(["Product1"], n_samples=10, random_seed=42)
        report2 = f_1718(["Product1"], n_samples=10, random_seed=42)
        pd.testing.assert_frame_equal(report1, report2)


def run_tests():
    """Load every test from TestCases and run it with a text runner."""
    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
    unittest.TextTestRunner().run(suite)

if __name__ == "__main__":
    import doctest

    # Docstring examples are checked first; the unittest suite follows.
    doctest.testmod()
    run_tests()
Loading

0 comments on commit b379271

Please sign in to comment.