Skip to content

Commit

Permalink
v4.5.0
Browse files Browse the repository at this point in the history
- Adjusted data transformation process in graphql_scraper.py
  • Loading branch information
sakan811 committed Jun 18, 2024
1 parent e362510 commit 0ce0951
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 3 deletions.
9 changes: 6 additions & 3 deletions japan_avg_hotel_price_finder/graphql_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,13 +535,16 @@ def transform_data_in_df(check_in, city, dataframe) -> pd.DataFrame:
logger.info("Remove duplicate rows from the DataFrame based on 'Hotel' column")
df_filtered = dataframe.drop_duplicates(subset='Hotel').copy()

logger.info("Convert columns to numeric values")
df_filtered.loc[:, 'Price'] = pd.to_numeric(df_filtered['Price'], errors='coerce')
df_filtered.loc[:, 'Review'] = pd.to_numeric(df_filtered['Review'], errors='coerce')

# Drop rows where any of the 'Hotel', 'Review', 'Price' columns are None or NaN
logger.info("Dropping rows where 'Hotel', 'Review', or 'Price' columns are None or NaN")
df_filtered = df_filtered.dropna(subset=['Hotel', 'Review', 'Price'])

logger.info("Convert columns to numeric values")
df_filtered.loc[:, 'Price'] = pd.to_numeric(df_filtered['Price'], errors='coerce')
df_filtered.loc[:, 'Review'] = pd.to_numeric(df_filtered['Review'], errors='coerce')
logger.info("Dropping rows where 'Review', or 'Price' columns are 0")
df_filtered = df_filtered[(df_filtered['Price'] != 0) & (df_filtered['Review'] != 0)]

logger.info("Calculate the Price/Review ratio")
df_filtered.loc[:, 'Price/Review'] = df_filtered['Price'] / df_filtered['Review']
Expand Down
17 changes: 17 additions & 0 deletions tests/test_graphql_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,23 @@ def test_transform_data_in_df_dropna():
assert len(result_df) == 1 # Only 'Hotel A'


def test_drop_rows_with_zero_price_or_review():
    """Rows with a zero 'Price' or a zero 'Review' must be removed.

    The fixture covers each zero condition on its own as well as both
    together, so the test fails if the filter only checks one column or
    combines the two conditions with the wrong boolean operator.
    """
    # Given
    data = {
        'Hotel': ['Hotel A', 'Hotel B', 'Hotel C', 'Hotel D', 'Hotel E'],
        # 'Hotel B': both zero; 'Hotel D': zero price only;
        # 'Hotel E': zero review only.
        'Review': [4.0, 0, 5.0, 4.5, 0],
        'Price': [200, 0, 250, 0, 300],
    }
    df = pd.DataFrame(data)

    # When
    result_df = transform_data_in_df('2024-06-17', 'Tokyo', df)

    # Then
    remaining_hotels = set(result_df['Hotel'].values)
    assert len(result_df) == 2  # Only 'Hotel A' and 'Hotel C' should remain
    assert remaining_hotels == {'Hotel A', 'Hotel C'}
    assert 'Hotel B' not in remaining_hotels  # zero price and zero review
    assert 'Hotel D' not in remaining_hotels  # zero price only
    assert 'Hotel E' not in remaining_hotels  # zero review only


def test_transform_data_in_df_calculation():
# Create a sample DataFrame
data = {
Expand Down

0 comments on commit 0ce0951

Please sign in to comment.