From b3e7aaadadca9eeeb951de4f5bd9610552cc0186 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:50:31 +0800
Subject: [PATCH 1/9] Drop old data_ppy_sh_to_csv submod

---
 .gitmodules             | 3 ---
 opal/data_ppy_sh_to_csv | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 opal/data_ppy_sh_to_csv

diff --git a/.gitmodules b/.gitmodules
index 2edf3d2d..e69de29b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "data_ppy_sh_to_csv"]
-	path = opal/data_ppy_sh_to_csv
-	url = https://github.com/Eve-ning/data_ppy_sh_to_csv.git
diff --git a/opal/data_ppy_sh_to_csv b/opal/data_ppy_sh_to_csv
deleted file mode 160000
index 41901131..00000000
--- a/opal/data_ppy_sh_to_csv
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 419011310dc2b849417975d277964e2849f0177f

From 32443a331eae82587ed2255885b509f656d51be8 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:50:51 +0800
Subject: [PATCH 2/9] Integrate new osu-data-csv module into
 score_datamodule.py

---
 opal/score/datamodule/score_datamodule.py | 26 +++++++++++------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/opal/score/datamodule/score_datamodule.py b/opal/score/datamodule/score_datamodule.py
index bda159c1..9a64fa1d 100644
--- a/opal/score/datamodule/score_datamodule.py
+++ b/opal/score/datamodule/score_datamodule.py
@@ -1,18 +1,18 @@
 import logging
-from dataclasses import dataclass, field
-from typing import Sequence, Tuple
-
 import numpy as np
 import pandas as pd
 import pytorch_lightning as pl
 import torch
+from dataclasses import dataclass, field
+from pathlib import Path
 from sklearn.base import TransformerMixin
 from sklearn.preprocessing import LabelEncoder, QuantileTransformer
 from torch.utils.data import DataLoader, TensorDataset, random_split
+from typing import Sequence, Tuple
 
 from opal.conf.conf import DATA_DIR
 from opal.conf.mods import OsuMod
-from opal.data_ppy_sh_to_csv.main import get_dataset, default_sql_names
+from osu_data_csv.main import get_dataset
 
 
 @dataclass
@@ -50,18 +50,16 @@ def __post_init__(self):
 
     def prepare_data(self) -> None:
         """ Downloads data via data_ppy_sh_to_csv submodule """
         get_dataset(
-            self.ds_yyyy_mm,  # year_month=
-            self.ds_mode,  # mode=
-            self.ds_set,  # set=
-            DATA_DIR,  # dl_dir=
-            'Y',  # bypass_confirm=
-            ",".join(default_sql_names[:4]),  # sql_names=
-            'N',  # cleanup=
-            'N'  # zip_csv_files=
+            year_month=self.ds_yyyy_mm,
+            mode=self.ds_mode,
+            set=self.ds_set,
+            dl_dir=DATA_DIR,
+            bypass_confirm='Y',
+            cleanup='N',
+            ignore_path=(Path(__file__).parent / "ignore_mapping.yaml").as_posix()
         )
 
     def setup(self, stage: str = "") -> None:
-        self.prepare_data()
         ds_str = f"{self.ds_yyyy_mm}_01_performance_{self.ds_mode}_top_{self.ds_set}"
         csv_dir = DATA_DIR / ds_str / "csv"
@@ -175,7 +173,7 @@ def prep_map(df: pd.DataFrame,
             (df['playmode'] == 3) &
             (df['diff_size'].isin(diff_sizes)) &
             (df['difficultyrating'].between(*sr_bounds)),
-            ['difficultyrating', 'diff_overall', 'diff_size', 'version', 'beatmap_id', 'filename']
+            ['difficultyrating', 'diff_overall', 'diff_size', 'beatmap_id', 'filename']
         ]
 
         return df
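
The rewritten prepare_data() above moves from positional to keyword arguments and swaps the old sql_names/zip_csv_files knobs for an ignore_path mapping (the mapping file itself lands in PATCH 7/9). A minimal driver sketch for the reworked datamodule follows; the class name ScoreDataModule and the field values are assumptions for illustration, since the diff only shows the ds_yyyy_mm/ds_mode/ds_set fields and the two methods:

    # Hypothetical usage; the class name and values are not from the patch series.
    from opal.score.datamodule.score_datamodule import ScoreDataModule

    dm = ScoreDataModule(ds_yyyy_mm="2023_01", ds_mode="mania", ds_set="1000")
    dm.prepare_data()  # now delegates to osu_data_csv.main.get_dataset(...)
    dm.setup()         # reads DATA_DIR / "2023_01_01_performance_mania_top_1000" / "csv"

Note that setup() no longer calls prepare_data() itself, so the download step must now be triggered explicitly; pytorch-lightning's Trainer does both automatically.
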
From 149beb01b0571e4584daa47f47acd82a6dc29c2d Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:53:07 +0800
Subject: [PATCH 3/9] Remove unused req

---
 requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 12b2cdbd..cbb734be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-reamber==0.1.6
 scikit-learn
-junit_xml
 pandas
 pytorch-lightning

From 3c2b12dc82ccade5bf1cb8eda3edb342adf07e3 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:53:52 +0800
Subject: [PATCH 4/9] Remove unused utils

---
 opal/utils/__init__.py |   4 -
 opal/utils/ecdf.py     |   9 ---
 opal/utils/load_map.py | 178 -----------------------------------------
 3 files changed, 191 deletions(-)
 delete mode 100644 opal/utils/__init__.py
 delete mode 100644 opal/utils/ecdf.py
 delete mode 100644 opal/utils/load_map.py

diff --git a/opal/utils/__init__.py b/opal/utils/__init__.py
deleted file mode 100644
index d421fec4..00000000
--- a/opal/utils/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .ecdf import ecdf
-from .load_map import load_replay, load_replays
-
-__all__ = ['load_replay', 'load_replays']
diff --git a/opal/utils/ecdf.py b/opal/utils/ecdf.py
deleted file mode 100644
index fbd7bf87..00000000
--- a/opal/utils/ecdf.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import pandas as pd
-
-
-def ecdf(x: pd.Series):
-    counts = x.value_counts()
-    x = counts.sort_index().cumsum() / len(counts)
-    x.index = x.index.get_level_values(0)
-    x /= x.max()
-    return x
\ No newline at end of file
diff --git a/opal/utils/load_map.py b/opal/utils/load_map.py
deleted file mode 100644
index 6c29ccc4..00000000
--- a/opal/utils/load_map.py
+++ /dev/null
@@ -1,178 +0,0 @@
-from __future__ import annotations
-
-import pickle
-from copy import deepcopy
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Tuple, Generator, Sequence
-
-import numpy as np
-import pandas as pd
-from reamber.algorithms.osu.OsuReplayError import osu_replay_error
-from reamber.algorithms.pattern import Pattern
-from reamber.base.Hold import HoldTail
-from reamber.base.lists.notes.NoteList import NoteList
-from reamber.osu.OsuHit import OsuHit
-from reamber.osu.OsuHold import OsuHold
-from reamber.osu.OsuMap import OsuMap
-
-from opal.conf import REPLAYS_DIR
-
-
-def load_replays(cache_reset=False) -> Generator[
-    Tuple[pd.DataFrame, OsuMap], None, None]:
-    for d in filter(lambda d_: d_.is_dir(), REPLAYS_DIR.iterdir()):
-        yield load_replay(d, cache_reset)
-
-
-def load_replay(map_dir: Path, cache_reset=False) -> Tuple[pd.DataFrame, OsuMap]:
-    return ReplayLoader(map_dir).load(cache_reset)
-
-
-@dataclass
-class ReplayLoader:
-    map_dir: Path
-    cache_name: str = "cache.pkl"
-
-    @staticmethod
-    def get_errors(osu: OsuMap, rep_paths: List[Path]) -> pd.DataFrame:
-        """ Gets the errors of the replays.
-
-        Args:
-            rep_paths: Paths of the replays
-            osu: Path of the osu! map
-        """
-
-        # Filter bad replays
-        rep_paths = [p for p in rep_paths if p.stat().st_size > 1000]
-
-        # Yield Replay Error
-        errors = osu_replay_error([r.as_posix() for r in rep_paths], osu)
-
-        # Get map offsets regardless of type
-        # k_o key offsets
-        df_map_offset = pd.DataFrame.from_records(
-            [(o,)
-             for _, k_o in [*errors.map_offsets.hits.items(),
-                            *errors.map_offsets.releases.items()]
-             for o in k_o],
-            columns=["offset"]
-        )
-
-        # Get replay errors as offset
-        df_errors = pd.DataFrame.from_records(
-            [(r_id, k, o)
-             for r_id, rep_offset in enumerate(errors.errors)
-             for k, k_o in [*rep_offset.hits.items(),
-                            *rep_offset.releases.items()]
-             for o in k_o],
-            columns=["r_id", "column", "error"]
-        )
-
-        return pd.merge(
-            # We combine map offsets & error
-            # For n replays, we repeat the map offsets n times
-            pd.concat([df_map_offset] * len(rep_paths)).reset_index(),
-            df_errors,
-            left_index=True,
-            right_index=True
-        ).drop('index', axis=1).astype(int).assign(
-            # Absolute error
-            error=lambda x: x.error.abs()
-        ).drop(['r_id', 'column'], axis=1).groupby(
-            # Get the median error
-            ['offset']
-        ).median().reset_index()
-
-    @staticmethod
-    def get_pattern(nls: Sequence[NoteList]) -> pd.DataFrame:
-        """ Gets the pattern of the map. """
-
-        grps = Pattern.from_note_lists(nls).group()
-
-        # Manually extract if columns are held
-        is_held = []
-        grps_hold = []
-        for grp in grps:
-            holds = [note.column for note in grp if note.type == OsuHold]
-            hits = [note.column for note in grp if note.type == OsuHit]
-            tails = [note.column for note in grp if note.type == HoldTail]
-
-            grps_hold.append(
-                [np.min(grp.offset), [*hits, *holds], deepcopy(is_held)]
-            )
-            is_held.extend(holds)
-            is_held = [c for c in is_held if c not in tails]
-
-        df = pd.DataFrame(grps_hold, columns=["offset", "columns", "is_held"])
-
-        # OHE for bigram
-        df_cols = pd.get_dummies(
-            df['columns'].apply(pd.Series, dtype=int).stack()).groupby(
-            level=0
-        ).sum()
-        df_cols.columns = [f'col_{c}' for c in range(len(df_cols.columns))]
-
-        # OHE for held
-        df_hold = pd.get_dummies(
-            df['is_held'].apply(pd.Series, dtype=int).stack()).groupby(
-            level=0
-        ).sum()
-        df_hold.columns = [f'is_held_{c}' for c in range(len(df_hold.columns))]
-
-        return (
-            pd.merge(
-                # Horizontally Join Offset & Cols
-                df['offset'], df_cols, how='left', left_index=True,
-                right_index=True
-            ).merge(
-                # Horizontally Join Offset, Cols & Held
-                df_hold, how='left', left_index=True,
-                right_index=True
-            ).fillna(
-                0
-            ).assign(
-                # Assign Delta to diff
-                diff=lambda x: x['offset'].diff().shift(-1)
-            )[:-1]
-        )
-
-    def load(self, cache_reset: bool = False) -> Tuple[pd.DataFrame, OsuMap]:
-        """ Prepare the data for the model """
-        if not cache_reset and self.cache_exists:
-            return self.cache_get()
-
-        else:
-            map_path = self.map_dir / (self.map_dir.name + ".osu")
-            osu = OsuMap.read_file(map_path.as_posix())
-            rep_dir = self.map_dir / "rep"
-            rep_paths = [p for p in rep_dir.iterdir() if p.is_file()]
-
-            data = pd.merge(
-                self.get_pattern([osu.hits, osu.holds]),
-                self.get_errors(osu, rep_paths),
-                how='left',
-                on='offset'
-            ).drop(['offset'], axis=1), osu
-
-            self.cache_set(data)
-
-            return data
-
-    @property
-    def cache_path(self) -> Path:
-        return self.map_dir / self.cache_name
-
-    def cache_set(self, data):
-        with open(self.cache_path, "wb+") as f:
-            pickle.dump(data, f)
-
-    @property
-    def cache_exists(self) -> bool:
-        return self.cache_path.exists()
-
-    def cache_get(self) -> Tuple[pd.DataFrame, OsuMap]:
-        with open(self.cache_path, "rb+") as f:
-            data = pickle.load(f)
-
-        return data
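
Of the removed helpers, ecdf is the only self-contained algorithm; for reference, a compact equivalent sketch (normalizing cumulative value counts by the total count, which gives the same result as the deleted divide-by-len-then-rescale version):

    import pandas as pd

    def ecdf(x: pd.Series) -> pd.Series:
        # Cumulative share of observations at or below each distinct value
        counts = x.value_counts().sort_index()
        return counts.cumsum() / counts.sum()

    print(ecdf(pd.Series([1, 1, 2, 3])))  # 1 -> 0.50, 2 -> 0.75, 3 -> 1.00

The ReplayLoader and pattern-extraction code depended on reamber, which PATCH 3/9 dropped from requirements.txt, so the rest of the package goes with it.
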
From e18b0a472edfe8b3f5fdfce641692cc21ac3164b Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:54:14 +0800
Subject: [PATCH 5/9] Remove empty .gitmodules

---
 .gitmodules | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .gitmodules

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29b..00000000

From 0ecae0b6b3104a7bd8d4d0bfbdebf1301583d444 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:56:18 +0800
Subject: [PATCH 6/9] Drop basic pyproject.toml

---
 pyproject.toml | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index ec33f447..00000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,5 +0,0 @@
-[tool.pytest.ini_options]
-log_cli = true
-log_cli_level = "INFO"
-log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
-log_cli_date_format = "%Y-%m-%d %H:%M:%S"
\ No newline at end of file
From 4ecaee3e1cb5c730ee1db3105fe62ff0c6ee3535 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:56:35 +0800
Subject: [PATCH 7/9] Add ignore_mapping.yaml

---
 opal/score/datamodule/ignore_mapping.yaml | 92 +++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 opal/score/datamodule/ignore_mapping.yaml

diff --git a/opal/score/datamodule/ignore_mapping.yaml b/opal/score/datamodule/ignore_mapping.yaml
new file mode 100644
index 00000000..e74fb3a4
--- /dev/null
+++ b/opal/score/datamodule/ignore_mapping.yaml
@@ -0,0 +1,92 @@
+# This is the default ignore mapping.
+# For every column that is NOT commented out, it'll be IGNORED when converting
+# Try to ignore those that are not necessary for your use
+# This will heavily reduce storage needed and increase processing speed
+# Once you're done, reference this file when calling osu-data-csv via
+# osu-data-csv -i path/to/ignore_mapping.yaml
+
+osu_beatmap_difficulty.sql:
+#  - beatmap_id
+#  - mode
+#  - mods
+#  - diff_unified
+  - last_update
+osu_beatmaps.sql:
+#  - beatmap_id
+  - beatmapset_id
+  - user_id
+#  - filename
+  - checksum
+#  - version
+  - total_length
+  - hit_length
+#  - countTotal
+#  - countNormal
+#  - countSlider
+#  - countSpinner
+#  - diff_drain
+#  - diff_size
+#  - diff_overall
+#  - diff_approach
+#  - playmode
+#  - approved
+  - last_update
+#  - difficultyrating
+#  - playcount
+#  - passcount
+#  - youtube_preview
+#  - score_version
+#  - deleted_at
+  - bpm
+osu_scores{mode}_high.sql:
+#  - score_id
+#  - beatmap_id
+#  - user_id
+#  - score
+  - maxcombo
+  - rank
+#  - count50
+#  - count100
+#  - count300
+#  - countmiss
+#  - countgeki
+#  - countkatu
+  - perfect
+#  - enabled_mods
+#  - date
+#  - pp
+#  - replay
+  - hidden
+  - country_acronym
+osu_user_stats{mode}.sql:
+#  - user_id
+  - count300
+  - count100
+  - count50
+  - countMiss
+  - accuracy_total
+  - accuracy_count
+  - accuracy
+  - playcount
+  - ranked_score
+  - total_score
+  - x_rank_count
+  - xh_rank_count
+  - s_rank_count
+  - sh_rank_count
+  - a_rank_count
+  - rank
+  - level
+  - replay_popularity
+  - fail_count
+  - exit_count
+  - max_combo
+  - country_acronym
+#  - rank_score
+#  - rank_score_index
+  - rank_score_exp
+  - rank_score_index_exp
+  - accuracy_new
+  - last_update
+  - last_played
+  - total_seconds_played
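
As the header comments state, any column left uncommented is ignored during conversion, while commented entries (e.g. beatmap_id, score, pp) are the ones kept. PATCH 2/9 already wires this file in through get_dataset's ignore_path argument; a standalone sketch of the same call follows, where only the keyword names come from the diff and the year/mode/paths are placeholders:

    from pathlib import Path
    from osu_data_csv.main import get_dataset

    # Mirrors prepare_data() in PATCH 2/9; values here are illustrative.
    get_dataset(
        year_month="2023_01",
        mode="mania",
        set="1000",
        dl_dir=Path("data"),
        bypass_confirm='Y',
        cleanup='N',
        ignore_path=Path("opal/score/datamodule/ignore_mapping.yaml").as_posix(),
    )

The same mapping works from the CLI via osu-data-csv -i path/to/ignore_mapping.yaml, as noted in the file itself.
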
From 61b6c101d82f607b2e772d9121082e97b71397d7 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 16:02:38 +0800
Subject: [PATCH 8/9] Remove unused replay package

---
 opal/conf/conf.py       | 1 -
 opal/replay/__init__.py | 0
 2 files changed, 1 deletion(-)
 delete mode 100644 opal/replay/__init__.py

diff --git a/opal/conf/conf.py b/opal/conf/conf.py
index f1416c6c..aa7ad96b 100644
--- a/opal/conf/conf.py
+++ b/opal/conf/conf.py
@@ -4,5 +4,4 @@
 DATA_DIR = ROOT_DIR / "data/"
 OSU_DIR = DATA_DIR / "osu/"
 MODEL_DIR = ROOT_DIR / "models/"
-REPLAYS_DIR = OSU_DIR / "replays"
 SCORES_DIR = OSU_DIR / "scores"
diff --git a/opal/replay/__init__.py b/opal/replay/__init__.py
deleted file mode 100644
index e69de29b..00000000

From aabac6f85bc611085da5fdced2318360399aa4b1 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 16:09:47 +0800
Subject: [PATCH 9/9] Add poetry init

---
 pyproject.toml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..e76622bf
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "opal"
+version = "0.1.0"
+description = "osu!mania score estimation through Collaborative Filtering"
+authors = ["Eve-ning "]
+license = "MIT"
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.9"
+scikit-learn = "^1.2.1"
+pandas = "^1.5.3"
+pytorch-lightning = "^1.9.1"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+
+[tool.pytest.ini_options]
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
\ No newline at end of file
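
The caret constraints follow poetry's usual semantics of "compatible within the declared major version". A quick check of what ^1.9.1 expands to, sketched with the third-party packaging library (not a dependency of this project):

    from packaging.specifiers import SpecifierSet

    # poetry's "^1.9.1" is shorthand for ">=1.9.1,<2.0.0"
    spec = SpecifierSet(">=1.9.1,<2.0.0")
    print(spec.contains("1.9.2"))  # True
    print(spec.contains("2.0.0"))  # False

With this in place, poetry install resolves the ranges into poetry.lock, and the [tool.pytest.ini_options] block dropped in PATCH 6/9 returns unchanged at the bottom of the new file.
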