Refactor environment (#70)
* created the environment and inserted the split and filter methods, in addition to refactoring the DatasetLoader.py and dataset.py modules

* fixed random split

* removed comments

* refactor DefaultDatasetLoader.py and dataset.py, created folder for filters and splits

* Set setup and make

* Saving progress in the environment refactor

* Implement loader

* Finish environment implementation

* Fix loader and split-base

* Adding loaders to the iRec

* Delete .idea directory

* environment integration: environment/load environment/split and environment/filter

* environment integration: environment/load environment/split and environment/filter

* finished the integration of the fixed and updated train-test load registry.py

* fixed num_total_users/items

* fixed imports

* Fix validation

* A simple example of tests

* fixed return of the train/test datasets

* added documentation for load module

* fixed assert

* Added docstrings and type hints

* Added docstrings and typehints

* Update docs for dataset.py and fixed warnings

* Fix simple returns

* Fix bugs

* Add bdd tests

* updated requirements

* removed unit test

* refactor: removed idea directory

* refactor: removed unnecessary Makefile

* fixed errors in yaml

* Update InteractionMetricEvaluator.py

* remove: traitlets dependency in run_agent

* feat: dev requirements included behave

* remove: redundant setup file

* refactor: removed all app branch changes

Co-authored-by: thiagodks <thiagoadriano2010@gmail.com>
Co-authored-by: Nicollas Silva <ncsilvaa@Nicollass-MBP.lan>
Co-authored-by: Thiago Silva <48692251+thiagodks@users.noreply.github.com>
Co-authored-by: Carlos Mito <carlosmsmito@gmail.com>
Co-authored-by: heitor57 <heitorwerneck@hotmail.com>
6 people authored Apr 8, 2022
1 parent 51f499e commit 74bdc28
Showing 31 changed files with 988 additions and 763 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ data/
*.aux
*.log
*.csv
.idea/

.vim/coc-settings.json
# Byte-compiled / optimized / DLL files
71 changes: 25 additions & 46 deletions irec/app/utils.py → irec/connector/utils.py
@@ -11,7 +11,7 @@
import json
from collections import defaultdict
from pathlib import Path
from irec.utils.dataset import TrainTestDataset
from irec.environment.dataset import Dataset
import collections
from app import constants
import matplotlib.ticker as mtick
@@ -217,7 +217,7 @@ def load_settings(workdir):
# with open(
# workdir + sep + "settings" + sep +
# "datasets_preprocessors_parameters.yaml") as f:
# d['datasets_preprocessors_parameters'] = yaml.load(f, Loader=loader)
# d['datasets_preprocessors_parameters'] = yaml.loader(f, Loader=loader)
# d['datasets_preprocessors_parameters'] = {
# k: {
# **setting,
@@ -270,7 +270,8 @@ def get_experiment_run_id(dm, evaluation_policy, itr_id):

def run_interactor(
agent,
traintest_dataset: TrainTestDataset,
train_dataset: Dataset,
test_dataset: Dataset,
evaluation_policy: EvaluationPolicy,
settings,
forced_run,
@@ -291,31 +292,19 @@ def run_interactor(
log_custom_parameters(get_agent_run_parameters(settings))

interactions, acts_info = evaluation_policy.evaluate(
agent, traintest_dataset.train, traintest_dataset.test
agent, train_dataset, test_dataset
)

fname = "./tmp/interactions.pickle"
log_custom_artifact(fname, interactions)
fname = "./tmp/acts_info.pickle"
log_custom_artifact(fname, acts_info)
# create_path_to_file(fname)
# with open(fname,mode='wb') as f:
# pickle.dump(history_items_recommended,f)
# mlflow.log_artifact(f.name)


def get_agent_id(agent_name, agent_parameters):
# agent_dict = class2dict(agent)
return agent_name + "_" + json.dumps(agent_parameters, separators=(",", ":"))


# def get_agent_id(agent, template_parameters):
# agent_dict = class2dict(agent)
# new_agent_settings = update_nested_dict(template_parameters, agent_dict)
# return agent.name + '_' + json.dumps(new_agent_settings,
# separators=(',', ':'))


def get_agent_id_from_settings(agent, settings):
agent_settings = next(
gen_dict_extract(agent.name, settings["agents_preprocessor_parameters"])
@@ -471,28 +460,20 @@ def load_dataset_experiment(settings):
)

client = MlflowClient()
artifact_path = client.download_artifacts(run.info.run_id, "dataset.pickle")
traintest_dataset = pickle.load(open(artifact_path, "rb"))
return traintest_dataset

train_artifact_path = client.download_artifacts(run.info.run_id, "train_dataset.pickle")
test_artifact_path = client.download_artifacts(run.info.run_id, "test_dataset.pickle")
train_dataset = pickle.load(open(train_artifact_path, "rb"))
test_dataset = pickle.load(open(test_artifact_path, "rb"))
return train_dataset, test_dataset

def run_agent(traintest_dataset, settings, forced_run):

# dataset_loader_parameters = settings["dataset_loaders"][
# settings["defaults"]["dataset_loader"]
# ]
def run_agent(train_dataset, test_dataset, settings, forced_run):

evaluation_policy_name = settings["defaults"]["evaluation_policy"]
evaluation_policy_parameters = settings["evaluation_policies"][
evaluation_policy_name
]

# exec("import irec.value_functions.{}".format(value_function_name))
# value_function = eval(
# "irec.value_functions.{}.{}".format(
# value_function_name, value_function_name
# )
# )(**value_function_parameters)
exec(
f"from irec.evaluation_policies.{evaluation_policy_name} import {evaluation_policy_name}"
)
@@ -504,7 +485,8 @@ def run_agent(traintest_dataset, settings, forced_run):
agent = AgentFactory().create(settings["defaults"]["agent"], agent_parameters)
run_interactor(
agent=agent,
traintest_dataset=traintest_dataset,
train_dataset=train_dataset,
test_dataset=test_dataset,
evaluation_policy=evaluation_policy,
settings=settings,
forced_run=forced_run,
@@ -737,20 +719,16 @@ def generate_base(dataset_name, settings):
dataset_loader_settings,
)
)
# client.log_param()
# for k,v in dataset_loader_settings.items():
# log_param(k,v)

from irec.utils.Factory import DatasetLoaderFactory

dataset_loader_factory = DatasetLoaderFactory()
dataset_loader = dataset_loader_factory.create(
dataset_name, dataset_loader_settings
)
dataset = dataset_loader.load()

fname = "./tmp/dataset.pickle"
log_custom_artifact(fname, dataset)
train_dataset, test_dataset = dataset_loader.process()
log_custom_artifact("./tmp/train_dataset.pickle", train_dataset)
log_custom_artifact("./tmp/test_dataset.pickle", test_dataset)


def download_data(dataset_names):
@@ -804,7 +782,7 @@ def run_agent_with_dataset_parameters(
for dataset_loader_name in dataset_loaders:
current_settings = settings
current_settings["defaults"]["dataset_loader"] = dataset_loader_name
traintest_dataset = load_dataset_experiment(settings)
train_dataset, test_dataset = load_dataset_experiment(settings)
for agent_name in agents:
current_settings["defaults"]["agent"] = agent_name
current_settings["agents"][agent_name] = dataset_agents_parameters[
@@ -813,15 +791,16 @@
if tasks>1:
f = executor.submit(
run_agent,
traintest_dataset,
train_dataset,
test_dataset,
copy.deepcopy(current_settings),
forced_run,
)
futures.add(f)
if len(futures) >= tasks:
completed, futures = wait(futures, return_when=FIRST_COMPLETED)
else:
run_agent(traintest_dataset,copy.deepcopy(current_settings),forced_run)
run_agent(train_dataset, test_dataset,copy.deepcopy(current_settings),forced_run)

for f in futures:
f.result()
@@ -1273,22 +1252,22 @@ def evaluate_agent_with_dataset_parameters(
):

from concurrent.futures import ProcessPoolExecutor, wait, FIRST_COMPLETED
from irec.utils.dataset import Dataset

with ProcessPoolExecutor(max_workers=tasks) as executor:
futures = set()
for dataset_loader_name in dataset_loaders:
settings["defaults"]["dataset_loader"] = dataset_loader_name

traintest_dataset = load_dataset_experiment(settings)
train_dataset, test_dataset = load_dataset_experiment(settings)

data = np.vstack(
(traintest_dataset.train.data, traintest_dataset.test.data)
(train_dataset.data, test_dataset.data)
)

dataset = Dataset(data)
dataset.update_from_data()
dataset.set_parameters()
dataset.update_num_total_users_items()

for agent_name in agents:
settings["defaults"]["agent"] = agent_name
settings["agents"][agent_name] = dataset_agents_parameters[
@@ -1347,7 +1326,7 @@ def eval_agent_search(
data = np.vstack((traintest.train.data, traintest.test.data))
dataset = copy.copy(traintest.train)
dataset.data = data
dataset.update_from_data()
dataset.set_parameters()
for agent_name in agents:
settings["defaults"]["agent"] = agent_name
for agent_og_parameters in agents_search_parameters[agent_name]:
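The utils.py changes above replace the single TrainTestDataset object with an explicit (train_dataset, test_dataset) pair throughout the pipeline. Below is a minimal sketch of the new calling convention; it is not part of this diff, and it assumes the working directory holds the usual irec settings yaml files and that the chosen dataset loader's experiment artifacts already exist.

from irec.connector.utils import load_settings, load_dataset_experiment, run_agent

# Assumed: settings yaml files live in the current directory.
settings = load_settings(".")

# load_dataset_experiment now returns two Dataset objects instead of one
# TrainTestDataset wrapper.
train_dataset, test_dataset = load_dataset_experiment(settings)

# run_agent takes both splits explicitly and forwards them to
# evaluation_policy.evaluate(agent, train_dataset, test_dataset).
run_agent(train_dataset, test_dataset, settings, forced_run=False)
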
Empty file added irec/environment/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions irec/environment/dataset.py
@@ -0,0 +1,80 @@
from typing import List
import numpy as np


class Dataset:

num_users = 0
num_items = 0
rate_domain = set()
max_uid = 0
max_iid = 0
mean_rating = 0
min_rating = 0
max_rating = 0

def __init__(
self,
data: np.ndarray
):
"""__init__
Args:
data (np.ndarray): the data
"""
self.data = data
self.num_total_users = 0
self.num_total_items = 0

@staticmethod
def normalize_ids(ids: List) -> np.array:
"""normalize_ids
normalizes the ids by putting them in sequence
Args:
ids (List): list of ids
Returns:
result (np.array): the normalized ids
"""
unique_values = np.sort(np.unique(ids))
result = np.searchsorted(unique_values, ids)
return result

def reset_index(self):
"""reset_index
Resets user and item indices
"""
self.data[:, 0] = self.normalize_ids(self.data[:, 0])
self.data[:, 1] = self.normalize_ids(self.data[:, 1])

def set_parameters(self):

"""set_parameters
Calculates and updates the database parameters
"""
self.num_users = len(np.unique(self.data[:, 0]))
self.num_items = len(np.unique(self.data[:, 1]))
self.rate_domain = set(np.unique(self.data[:, 2]))
self.uids = np.unique(self.data[:, 0]).astype(int)
self.iids = np.unique(self.data[:, 1]).astype(int)
self.max_uid = np.max(self.uids)
self.max_iid = np.max(self.iids)
self.mean_rating = np.mean(self.data[:, 2])
self.min_rating = np.min(self.data[:, 2])
self.max_rating = np.max(self.data[:, 2])

def update_num_total_users_items(self, num_total_users=0, num_total_items=0):
"""update_num_total_users_items
Updates the total number of users and items
"""
self.num_total_users = num_total_users if num_total_users > self.max_uid+1 else self.max_uid+1
self.num_total_items = num_total_items if num_total_items > self.max_iid+1 else self.max_iid+1
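
As a quick orientation to the new irec/environment/dataset.py module above, here is a small illustrative sketch of one plausible call order; it is not part of the commit, and the toy interaction array is invented.

import numpy as np
from irec.environment.dataset import Dataset

# Toy (user, item, rating, timestamp) rows with deliberately non-contiguous ids.
data = np.array([
    [10, 100, 4.0, 1],
    [10, 102, 3.0, 2],
    [42, 100, 5.0, 3],
], dtype=float)

dataset = Dataset(data)
dataset.reset_index()                   # remaps users {10, 42} -> {0, 1} and items {100, 102} -> {0, 1}
dataset.set_parameters()                # fills num_users, num_items, rate_domain, mean_rating, ...
dataset.update_num_total_users_items()  # with no arguments, falls back to max_uid+1 and max_iid+1

print(dataset.num_users, dataset.num_items)              # 2 2
print(dataset.num_total_users, dataset.num_total_items)  # 2 2
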
47 changes: 47 additions & 0 deletions irec/environment/filter/filtering_by_items.py
@@ -0,0 +1,47 @@
import random
from pandas import DataFrame


class FilteringByItems:
"""FilteringByItems.
This class contains different filtering by item approaches.
"""

def __init__(self):
pass

@staticmethod
def min_ratings(df_dataset: DataFrame, min_ratings: int) -> DataFrame:
"""min_ratings.
This function removes items whose total number of
ratings is less than [min_ratings].
Args:
df_dataset (DataFrame): the data to be filtered.
min_ratings (int): minimum number of ratings.
Returns:
The data filtered by the number of ratings.
"""
selected_items = dict(
df_dataset.groupby("itemId")["userId"].agg("count")[
lambda ratings: ratings >= min_ratings
]
)
return df_dataset[df_dataset["itemId"].isin(selected_items)]

@staticmethod
def num_items(df_dataset: DataFrame, num_items: int) -> DataFrame:
"""num_items.
This function limits the number of distinct items in the dataset.
Args:
df_dataset (DataFrame): the data to be filtered.
num_items (int): maximum number of items in the dataset.
Returns:
The data filtered by the number of items.
"""
try:
selected_items = random.sample(list(df_dataset["itemId"].unique()), num_items)
except:
return df_dataset
return df_dataset[df_dataset["itemId"].isin(selected_items)]
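
A brief illustrative example of applying the item-side filter above to a ratings DataFrame; it is not part of the commit, the sample frame is invented, and the import path assumes the filter package is importable under the path shown in the file header.

import pandas as pd
from irec.environment.filter.filtering_by_items import FilteringByItems

df = pd.DataFrame({
    "userId": [1, 2, 3, 1, 2],
    "itemId": [10, 10, 10, 20, 30],
    "rating": [4, 5, 3, 2, 4],
})

# Keep only items rated at least twice: item 10 (3 ratings) survives,
# items 20 and 30 (1 rating each) are dropped.
filtered = FilteringByItems.min_ratings(df, min_ratings=2)
print(filtered["itemId"].unique())  # [10]
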
47 changes: 47 additions & 0 deletions irec/environment/filter/filtering_by_users.py
@@ -0,0 +1,47 @@
import random
from pandas import DataFrame


class FilteringByUsers:
"""FilteringByUsers.
This class contains different filtering by users approaches.
"""

def __init__(self):
pass

@staticmethod
def min_consumption(df_dataset: DataFrame, min_consumption: int) -> DataFrame:
"""min_consumption.
This function removes users whose total number of
consumptions is less than [min_consumption].
Args:
df_dataset (DataFrame): the data to be filtered.
min_consumption (int): minimum number of items consumed by a user.
Returns:
The data filtered by the number of consumptions.
"""
selected_users = dict(
df_dataset.groupby("userId")["itemId"].agg("count")[
lambda consumption: consumption >= min_consumption
]
)
return df_dataset[df_dataset["userId"].isin(selected_users)]

@staticmethod
def num_users(df_dataset: DataFrame, num_users: int) -> DataFrame:
"""num_users.
This function limits the number of distinct users in the dataset.
Args:
df_dataset (DataFrame): the data to be filtered.
num_users (int): maximum number of users in the dataset.
Returns:
The data filtered by the number of users.
"""
try:
selected_users = random.sample(list(df_dataset["userId"].unique()), num_users)
except:
return df_dataset
return df_dataset[df_dataset["userId"].isin(selected_users)]
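
And a matching illustrative sketch for the user-side filters, again not part of the commit; note that num_users samples users at random, so results vary between runs, and it falls back to returning the full frame when fewer than num_users distinct users are available.

import random
import pandas as pd
from irec.environment.filter.filtering_by_users import FilteringByUsers

df = pd.DataFrame({
    "userId": [1, 1, 1, 2, 3],
    "itemId": [10, 20, 30, 10, 20],
    "rating": [5, 4, 3, 5, 2],
})

# Keep users who consumed at least 2 items: user 1 (3 items) survives.
dense = FilteringByUsers.min_consumption(df, min_consumption=2)
print(dense["userId"].unique())  # [1]

# Randomly keep at most 2 distinct users from the original frame.
random.seed(0)  # seeded only to make this illustration reproducible
sampled = FilteringByUsers.num_users(df, num_users=2)
print(sorted(sampled["userId"].unique()))  # two of {1, 2, 3}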
