From b3e7aaadadca9eeeb951de4f5bd9610552cc0186 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:50:31 +0800
Subject: [PATCH 1/9] Drop old data_ppy_sh_to_csv submod

---
 .gitmodules             | 3 ---
 opal/data_ppy_sh_to_csv | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 opal/data_ppy_sh_to_csv

diff --git a/.gitmodules b/.gitmodules
index 2edf3d2d..e69de29b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "data_ppy_sh_to_csv"]
-	path = opal/data_ppy_sh_to_csv
-	url = https://github.com/Eve-ning/data_ppy_sh_to_csv.git
diff --git a/opal/data_ppy_sh_to_csv b/opal/data_ppy_sh_to_csv
deleted file mode 160000
index 41901131..00000000
--- a/opal/data_ppy_sh_to_csv
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 419011310dc2b849417975d277964e2849f0177f

From 32443a331eae82587ed2255885b509f656d51be8 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:50:51 +0800
Subject: [PATCH 2/9] Integrate new osu-data-csv module into
 score_datamodule.py

---
 opal/score/datamodule/score_datamodule.py | 26 +++++++++++------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/opal/score/datamodule/score_datamodule.py b/opal/score/datamodule/score_datamodule.py
index bda159c1..9a64fa1d 100644
--- a/opal/score/datamodule/score_datamodule.py
+++ b/opal/score/datamodule/score_datamodule.py
@@ -1,18 +1,18 @@
 import logging
-from dataclasses import dataclass, field
-from typing import Sequence, Tuple
-
 import numpy as np
 import pandas as pd
 import pytorch_lightning as pl
 import torch
+from dataclasses import dataclass, field
+from pathlib import Path
 from sklearn.base import TransformerMixin
 from sklearn.preprocessing import LabelEncoder, QuantileTransformer
 from torch.utils.data import DataLoader, TensorDataset, random_split
+from typing import Sequence, Tuple
 
 from opal.conf.conf import DATA_DIR
 from opal.conf.mods import OsuMod
-from opal.data_ppy_sh_to_csv.main import get_dataset, default_sql_names
+from osu_data_csv.main import get_dataset
 
 
 @dataclass
@@ -50,18 +50,16 @@ def __post_init__(self):
 
     def prepare_data(self) -> None:
         """ Downloads data via data_ppy_sh_to_csv submodule """
         get_dataset(
-            self.ds_yyyy_mm,  # year_month=
-            self.ds_mode,  # mode=
-            self.ds_set,  # set=
-            DATA_DIR,  # dl_dir=
-            'Y',  # bypass_confirm=
-            ",".join(default_sql_names[:4]),  # sql_names=
-            'N',  # cleanup=
-            'N'  # zip_csv_files=
+            year_month=self.ds_yyyy_mm,
+            mode=self.ds_mode,
+            set=self.ds_set,
+            dl_dir=DATA_DIR,
+            bypass_confirm='Y',
+            cleanup='N',
+            ignore_path=(Path(__file__).parent / "ignore_mapping.yaml").as_posix()
         )
 
     def setup(self, stage: str = "") -> None:
-        self.prepare_data()
         ds_str = f"{self.ds_yyyy_mm}_01_performance_{self.ds_mode}_top_{self.ds_set}"
         csv_dir = DATA_DIR / ds_str / "csv"
@@ -175,7 +173,7 @@ def prep_map(df: pd.DataFrame,
             (df['playmode'] == 3) &
             (df['diff_size'].isin(diff_sizes)) &
             (df['difficultyrating'].between(*sr_bounds)),
-            ['difficultyrating', 'diff_overall', 'diff_size', 'version', 'beatmap_id', 'filename']
+            ['difficultyrating', 'diff_overall', 'diff_size', 'beatmap_id', 'filename']
         ]
 
         return df
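
The rewritten prepare_data() above moves from positional to keyword arguments and swaps the old sql_names/zip_csv_files knobs for an ignore_path mapping (the mapping file itself lands in PATCH 7/9). A minimal driver sketch for the reworked datamodule follows; the class name ScoreDataModule and the field values are assumptions for illustration, since the diff only shows the ds_yyyy_mm/ds_mode/ds_set fields and the two methods:

    # Hypothetical usage; the class name and values are not from the patch series.
    from opal.score.datamodule.score_datamodule import ScoreDataModule

    dm = ScoreDataModule(ds_yyyy_mm="2023_01", ds_mode="mania", ds_set="1000")
    dm.prepare_data()  # now delegates to osu_data_csv.main.get_dataset(...)
    dm.setup()         # reads DATA_DIR / "2023_01_01_performance_mania_top_1000" / "csv"

Note that setup() no longer calls prepare_data() itself, so the download step must now be triggered explicitly; pytorch-lightning's Trainer does both automatically.
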
From 149beb01b0571e4584daa47f47acd82a6dc29c2d Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:53:07 +0800
Subject: [PATCH 3/9] Remove unused req

---
 requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 12b2cdbd..cbb734be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-reamber==0.1.6
 scikit-learn
-junit_xml
 pandas
 pytorch-lightning

From 3c2b12dc82ccade5bf1cb8eda3edb342adf07e3 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:53:52 +0800
Subject: [PATCH 4/9] Remove unused utils

---
 opal/utils/__init__.py |   4 -
 opal/utils/ecdf.py     |   9 ---
 opal/utils/load_map.py | 178 -----------------------------------------
 3 files changed, 191 deletions(-)
 delete mode 100644 opal/utils/__init__.py
 delete mode 100644 opal/utils/ecdf.py
 delete mode 100644 opal/utils/load_map.py

diff --git a/opal/utils/__init__.py b/opal/utils/__init__.py
deleted file mode 100644
index d421fec4..00000000
--- a/opal/utils/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .ecdf import ecdf
-from .load_map import load_replay, load_replays
-
-__all__ = ['load_replay', 'load_replays']
diff --git a/opal/utils/ecdf.py b/opal/utils/ecdf.py
deleted file mode 100644
index fbd7bf87..00000000
--- a/opal/utils/ecdf.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import pandas as pd
-
-
-def ecdf(x: pd.Series):
-    counts = x.value_counts()
-    x = counts.sort_index().cumsum() / len(counts)
-    x.index = x.index.get_level_values(0)
-    x /= x.max()
-    return x
\ No newline at end of file
diff --git a/opal/utils/load_map.py b/opal/utils/load_map.py
deleted file mode 100644
index 6c29ccc4..00000000
--- a/opal/utils/load_map.py
+++ /dev/null
@@ -1,178 +0,0 @@
-from __future__ import annotations
-
-import pickle
-from copy import deepcopy
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Tuple, Generator, Sequence
-
-import numpy as np
-import pandas as pd
-from reamber.algorithms.osu.OsuReplayError import osu_replay_error
-from reamber.algorithms.pattern import Pattern
-from reamber.base.Hold import HoldTail
-from reamber.base.lists.notes.NoteList import NoteList
-from reamber.osu.OsuHit import OsuHit
-from reamber.osu.OsuHold import OsuHold
-from reamber.osu.OsuMap import OsuMap
-
-from opal.conf import REPLAYS_DIR
-
-
-def load_replays(cache_reset=False) -> Generator[
-    Tuple[pd.DataFrame, OsuMap], None, None]:
-    for d in filter(lambda d_: d_.is_dir(), REPLAYS_DIR.iterdir()):
-        yield load_replay(d, cache_reset)
-
-
-def load_replay(map_dir: Path, cache_reset=False) -> Tuple[pd.DataFrame, OsuMap]:
-    return ReplayLoader(map_dir).load(cache_reset)
-
-
-@dataclass
-class ReplayLoader:
-    map_dir: Path
-    cache_name: str = "cache.pkl"
-
-    @staticmethod
-    def get_errors(osu: OsuMap, rep_paths: List[Path]) -> pd.DataFrame:
-        """ Gets the errors of the replays.
-
-        Args:
-            rep_paths: Paths of the replays
-            osu: Path of the osu! map
-        """
-
-        # Filter bad replays
-        rep_paths = [p for p in rep_paths if p.stat().st_size > 1000]
-
-        # Yield Replay Error
-        errors = osu_replay_error([r.as_posix() for r in rep_paths], osu)
-
-        # Get map offsets regardless of type
-        # k_o key offsets
-        df_map_offset = pd.DataFrame.from_records(
-            [(o,)
-             for _, k_o in [*errors.map_offsets.hits.items(),
-                            *errors.map_offsets.releases.items()]
-             for o in k_o],
-            columns=["offset"]
-        )
-
-        # Get replay errors as offset
-        df_errors = pd.DataFrame.from_records(
-            [(r_id, k, o)
-             for r_id, rep_offset in enumerate(errors.errors)
-             for k, k_o in [*rep_offset.hits.items(),
-                            *rep_offset.releases.items()]
-             for o in k_o],
-            columns=["r_id", "column", "error"]
-        )
-
-        return pd.merge(
-            # We combine map offsets & error
-            # For n replays, we repeat the map offsets n times
-            pd.concat([df_map_offset] * len(rep_paths)).reset_index(),
-            df_errors,
-            left_index=True,
-            right_index=True
-        ).drop('index', axis=1).astype(int).assign(
-            # Absolute error
-            error=lambda x: x.error.abs()
-        ).drop(['r_id', 'column'], axis=1).groupby(
-            # Get the median error
-            ['offset']
-        ).median().reset_index()
-
-    @staticmethod
-    def get_pattern(nls: Sequence[NoteList]) -> pd.DataFrame:
-        """ Gets the pattern of the map. """
-
-        grps = Pattern.from_note_lists(nls).group()
-
-        # Manually extract if columns are held
-        is_held = []
-        grps_hold = []
-        for grp in grps:
-            holds = [note.column for note in grp if note.type == OsuHold]
-            hits = [note.column for note in grp if note.type == OsuHit]
-            tails = [note.column for note in grp if note.type == HoldTail]
-
-            grps_hold.append(
-                [np.min(grp.offset), [*hits, *holds], deepcopy(is_held)]
-            )
-            is_held.extend(holds)
-            is_held = [c for c in is_held if c not in tails]
-
-        df = pd.DataFrame(grps_hold, columns=["offset", "columns", "is_held"])
-
-        # OHE for bigram
-        df_cols = pd.get_dummies(
-            df['columns'].apply(pd.Series, dtype=int).stack()).groupby(
-            level=0
-        ).sum()
-        df_cols.columns = [f'col_{c}' for c in range(len(df_cols.columns))]
-
-        # OHE for held
-        df_hold = pd.get_dummies(
-            df['is_held'].apply(pd.Series, dtype=int).stack()).groupby(
-            level=0
-        ).sum()
-        df_hold.columns = [f'is_held_{c}' for c in range(len(df_hold.columns))]
-
-        return (
-            pd.merge(
-                # Horizontally Join Offset & Cols
-                df['offset'], df_cols, how='left', left_index=True,
-                right_index=True
-            ).merge(
-                # Horizontally Join Offset, Cols & Held
-                df_hold, how='left', left_index=True,
-                right_index=True
-            ).fillna(
-                0
-            ).assign(
-                # Assign Delta to diff
-                diff=lambda x: x['offset'].diff().shift(-1)
-            )[:-1]
-        )
-
-    def load(self, cache_reset: bool = False) -> Tuple[pd.DataFrame, OsuMap]:
-        """ Prepare the data for the model """
-        if not cache_reset and self.cache_exists:
-            return self.cache_get()
-
-        else:
-            map_path = self.map_dir / (self.map_dir.name + ".osu")
-            osu = OsuMap.read_file(map_path.as_posix())
-            rep_dir = self.map_dir / "rep"
-            rep_paths = [p for p in rep_dir.iterdir() if p.is_file()]
-
-            data = pd.merge(
-                self.get_pattern([osu.hits, osu.holds]),
-                self.get_errors(osu, rep_paths),
-                how='left',
-                on='offset'
-            ).drop(['offset'], axis=1), osu
-
-            self.cache_set(data)
-
-            return data
-
-    @property
-    def cache_path(self) -> Path:
-        return self.map_dir / self.cache_name
-
-    def cache_set(self, data):
-        with open(self.cache_path, "wb+") as f:
-            pickle.dump(data, f)
-
-    @property
-    def cache_exists(self) -> bool:
-        return self.cache_path.exists()
-
-    def cache_get(self) -> Tuple[pd.DataFrame, OsuMap]:
-        with open(self.cache_path, "rb+") as f:
-            data = pickle.load(f)
-
-        return data
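
Of the removed helpers, ecdf is the only self-contained algorithm; for reference, a compact equivalent sketch (normalizing cumulative value counts by the total count, which gives the same result as the deleted divide-by-len-then-rescale version):

    import pandas as pd

    def ecdf(x: pd.Series) -> pd.Series:
        # Cumulative share of observations at or below each distinct value
        counts = x.value_counts().sort_index()
        return counts.cumsum() / counts.sum()

    print(ecdf(pd.Series([1, 1, 2, 3])))  # 1 -> 0.50, 2 -> 0.75, 3 -> 1.00

The ReplayLoader and pattern-extraction code depended on reamber, which PATCH 3/9 dropped from requirements.txt, so the rest of the package goes with it.
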
From e18b0a472edfe8b3f5fdfce641692cc21ac3164b Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:54:14 +0800
Subject: [PATCH 5/9] Remove empty .gitmodules

---
 .gitmodules | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .gitmodules

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29b..00000000

From 0ecae0b6b3104a7bd8d4d0bfbdebf1301583d444 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:56:18 +0800
Subject: [PATCH 6/9] Drop basic pyproject.toml

---
 pyproject.toml | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index ec33f447..00000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,5 +0,0 @@
-[tool.pytest.ini_options]
-log_cli = true
-log_cli_level = "INFO"
-log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
-log_cli_date_format = "%Y-%m-%d %H:%M:%S"
\ No newline at end of file
From 4ecaee3e1cb5c730ee1db3105fe62ff0c6ee3535 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 15:56:35 +0800
Subject: [PATCH 7/9] Add ignore_mapping.yaml

---
 opal/score/datamodule/ignore_mapping.yaml | 92 +++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 opal/score/datamodule/ignore_mapping.yaml

diff --git a/opal/score/datamodule/ignore_mapping.yaml b/opal/score/datamodule/ignore_mapping.yaml
new file mode 100644
index 00000000..e74fb3a4
--- /dev/null
+++ b/opal/score/datamodule/ignore_mapping.yaml
@@ -0,0 +1,92 @@
+# This is the default ignore mapping.
+# For every column that is NOT commented out, it'll be IGNORED when converting
+# Try to ignore those that are not necessary for your use
+# This will heavily reduce storage needed and increase processing speed
+# Once you're done, reference this file when calling osu-data-csv via
+# osu-data-csv -i path/to/ignore_mapping.yaml
+
+osu_beatmap_difficulty.sql:
+#  - beatmap_id
+#  - mode
+#  - mods
+#  - diff_unified
+  - last_update
+osu_beatmaps.sql:
+#  - beatmap_id
+  - beatmapset_id
+  - user_id
+#  - filename
+  - checksum
+#  - version
+  - total_length
+  - hit_length
+#  - countTotal
+#  - countNormal
+#  - countSlider
+#  - countSpinner
+#  - diff_drain
+#  - diff_size
+#  - diff_overall
+#  - diff_approach
+#  - playmode
+#  - approved
+  - last_update
+#  - difficultyrating
+#  - playcount
+#  - passcount
+#  - youtube_preview
+#  - score_version
+#  - deleted_at
+  - bpm
+osu_scores{mode}_high.sql:
+#  - score_id
+#  - beatmap_id
+#  - user_id
+#  - score
+  - maxcombo
+  - rank
+#  - count50
+#  - count100
+#  - count300
+#  - countmiss
+#  - countgeki
+#  - countkatu
+  - perfect
+#  - enabled_mods
+#  - date
+#  - pp
+#  - replay
+  - hidden
+  - country_acronym
+osu_user_stats{mode}.sql:
+#  - user_id
+  - count300
+  - count100
+  - count50
+  - countMiss
+  - accuracy_total
+  - accuracy_count
+  - accuracy
+  - playcount
+  - ranked_score
+  - total_score
+  - x_rank_count
+  - xh_rank_count
+  - s_rank_count
+  - sh_rank_count
+  - a_rank_count
+  - rank
+  - level
+  - replay_popularity
+  - fail_count
+  - exit_count
+  - max_combo
+  - country_acronym
+#  - rank_score
+#  - rank_score_index
+  - rank_score_exp
+  - rank_score_index_exp
+  - accuracy_new
+  - last_update
+  - last_played
+  - total_seconds_played
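
As the header comments state, any column left uncommented is ignored during conversion, while commented entries (e.g. beatmap_id, score, pp) are the ones kept. PATCH 2/9 already wires this file in through get_dataset's ignore_path argument; a standalone sketch of the same call follows, where only the keyword names come from the diff and the year/mode/paths are placeholders:

    from pathlib import Path
    from osu_data_csv.main import get_dataset

    # Mirrors prepare_data() in PATCH 2/9; values here are illustrative.
    get_dataset(
        year_month="2023_01",
        mode="mania",
        set="1000",
        dl_dir=Path("data"),
        bypass_confirm='Y',
        cleanup='N',
        ignore_path=Path("opal/score/datamodule/ignore_mapping.yaml").as_posix(),
    )

The same mapping works from the CLI via osu-data-csv -i path/to/ignore_mapping.yaml, as noted in the file itself.
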
From 61b6c101d82f607b2e772d9121082e97b71397d7 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 16:02:38 +0800
Subject: [PATCH 8/9] Remove unused replay package

---
 opal/conf/conf.py       | 1 -
 opal/replay/__init__.py | 0
 2 files changed, 1 deletion(-)
 delete mode 100644 opal/replay/__init__.py

diff --git a/opal/conf/conf.py b/opal/conf/conf.py
index f1416c6c..aa7ad96b 100644
--- a/opal/conf/conf.py
+++ b/opal/conf/conf.py
@@ -4,5 +4,4 @@
 DATA_DIR = ROOT_DIR / "data/"
 OSU_DIR = DATA_DIR / "osu/"
 MODEL_DIR = ROOT_DIR / "models/"
-REPLAYS_DIR = OSU_DIR / "replays"
 SCORES_DIR = OSU_DIR / "scores"
diff --git a/opal/replay/__init__.py b/opal/replay/__init__.py
deleted file mode 100644
index e69de29b..00000000

From aabac6f85bc611085da5fdced2318360399aa4b1 Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Tue, 14 Feb 2023 16:09:47 +0800
Subject: [PATCH 9/9] Add poetry init

---
 pyproject.toml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..e76622bf
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "opal"
+version = "0.1.0"
+description = "osu!mania score estimation through Collaborative Filtering"
+authors = ["Eve-ning "]
+license = "MIT"
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.9"
+scikit-learn = "^1.2.1"
+pandas = "^1.5.3"
+pytorch-lightning = "^1.9.1"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+
+[tool.pytest.ini_options]
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
\ No newline at end of file
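
The caret constraints follow poetry's usual semantics of "compatible within the declared major version". A quick check of what ^1.9.1 expands to, sketched with the third-party packaging library (not a dependency of this project):

    from packaging.specifiers import SpecifierSet

    # poetry's "^1.9.1" is shorthand for ">=1.9.1,<2.0.0"
    spec = SpecifierSet(">=1.9.1,<2.0.0")
    print(spec.contains("1.9.2"))  # True
    print(spec.contains("2.0.0"))  # False

With this in place, poetry install resolves the ranges into poetry.lock, and the [tool.pytest.ini_options] block dropped in PATCH 6/9 returns unchanged at the bottom of the new file.
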