Merge branch 'felixbur:main' into master

bagustris · Apr 26, 2024 · 2883a9a · 2883a9a
2 parents dbe895f + de0131a
commit 2883a9a
Show file tree

Hide file tree

Showing 17 changed files with 168 additions and 384 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,26 @@
 Changelog
 =========
 
+Version 0.83.0
+--------------
+* test module now prints out reports
+
+Version 0.82.4
+--------------
+* fixed bug in wavlm 
+
+Version 0.82.3
+--------------
+* fixed another audformat peculiarity to interpret time values as nanoseconds
+
+Version 0.82.2
+--------------
+* fixed audformat peculiarity that dataframes can have only one column
+
+Version 0.82.1
+--------------
+* Add more tests for GC action
+
 Version 0.82.0
 --------------
 * added nkuluflag module

diff --git a/...mos/multiple_exeriments/do_experiments.py → ...os/multiple_experiments/do_experiments.py b/...mos/multiple_exeriments/do_experiments.py → ...os/multiple_experiments/do_experiments.py
@@ -19,13 +19,13 @@
     # {'--feat': 'os',
     # '--set': 'ComParE_2016',
     # },
-    {"--feat": "audmodel"},
+    {"--feat": "praat"},
 ]
 
 
 for c in classifiers:
     for f in features:
-        cmd = "python -m nkululeko.nkuluflag --config exp.ini "
+        cmd = "python -m nkululeko.nkuluflag --config meta/demos/multiple_exeriments/exp.ini "
         for item in c:
             cmd += f"{item} {c[item]} "
         for item in f:

diff --git a/meta/demos/multiple_exeriments/exp.ini → meta/demos/multiple_experiments/exp.ini b/meta/demos/multiple_exeriments/exp.ini → meta/demos/multiple_experiments/exp.ini
@@ -11,6 +11,7 @@ emodb.train_tables = ['emotion.categories.train.gold_standard']
 emodb.test_tables = ['emotion.categories.test.gold_standard']
 target = emotion
 labels = ['anger', 'happiness']
+tests = ['emodb']
 [FEATS]
 [MODEL]
 C_val = .001

diff --git a/meta/demos/multiple_experiments/tmp.ini b/meta/demos/multiple_experiments/tmp.ini
@@ -0,0 +1,28 @@
+[EXP]
+root = ./
+name = results
+runs = 1
+epochs = 1
+
+[DATA]
+databases = ['emodb']
+emodb = ../../../data/emodb/emodb
+emodb.split_strategy = specified
+emodb.train_tables = ['emotion.categories.train.gold_standard']
+emodb.test_tables = ['emotion.categories.test.gold_standard']
+target = emotion
+labels = ['anger', 'happiness']
+
+[FEATS]
+type = ['praat']
+
+[MODEL]
+c_val = .001
+learning_rate = 0.0001
+store = True
+patience = 5
+type = svm
+
+[PLOT]
+best_model = True
+
diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.82.0"
+VERSION="0.83.0"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/data/dataset_csv.py b/nkululeko/data/dataset_csv.py
@@ -22,7 +22,18 @@ def load(self):
         #     data_file = os.path.join(exp_root, data_file)
         root = os.path.dirname(data_file)
         audio_path = self.util.config_val_data(self.name, "audio_path", "")
-        df = audformat.utils.read_csv(data_file)
+        df = pd.read_csv(data_file)
+        # special treatment for segmented dataframes with only one column:
+        if "start" in df.columns and len(df.columns) == 4:
+            index = audformat.segmented_index(
+                df.file.values, df.start.values, df.end.values
+            )
+            df = df.set_index(index)
+            df = df.drop(columns=["file", "start", "end"])
+        else:
+            df = audformat.utils.read_csv(data_file)
+        if isinstance(df, pd.Series):
+            df = df.to_frame()
         rename_cols = self.util.config_val_data(self.name, "colnames", False)
         if rename_cols:
             col_dict = ast.literal_eval(rename_cols)

diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py
@@ -23,7 +23,7 @@
 from nkululeko.reporting.report import Report
 from nkululeko.runmanager import Runmanager
 from nkululeko.scaler import Scaler
-from nkululeko.test_predictor import Test_predictor
+from nkululeko.test_predictor import TestPredictor
 from nkululeko.utils.util import Util
 
 
@@ -672,7 +672,7 @@ def demo(self, file, is_list, outfile):
     def predict_test_and_save(self, result_name):
         model = self.runmgr.get_best_model()
         model.set_testdata(self.df_test, self.feats_test)
-        test_predictor = Test_predictor(
+        test_predictor = TestPredictor(
             model, self.df_test, self.label_encoder, result_name
         )
         test_predictor.predict_and_store()

diff --git a/nkululeko/feat_extract/feats_hubert.py b/nkululeko/feat_extract/feats_hubert.py
@@ -6,23 +6,26 @@
 
 import os
 
-import audeer
-import nkululeko.glob_conf as glob_conf
 import pandas as pd
 import torch
 import torchaudio
-from audformat.utils import map_file_path
-from nkululeko.feat_extract.featureset import Featureset
 from tqdm import tqdm
-from transformers import HubertModel, Wav2Vec2FeatureExtractor
+from transformers import HubertModel
+from transformers import Wav2Vec2FeatureExtractor
+
+from nkululeko.feat_extract.featureset import Featureset
+import nkululeko.glob_conf as glob_conf
 
 
 class Hubert(Featureset):
-    """Class to extract HuBERT embedding)"""
+    """Class to extract HuBERT embedding)."""
 
     def __init__(self, name, data_df, feat_type):
-        """Constructor. is_train is needed to distinguish from test/dev sets,
-        because they use the codebook from the training"""
+        """Constructor.
+
+        Is_train is needed to distinguish from test/dev sets,
+        because they use the codebook from the training.
+        """
         super().__init__(name, data_df, feat_type)
         # check if device is not set, use cuda if available
         cuda = "cuda" if torch.cuda.is_available() else "cpu"
@@ -61,16 +64,12 @@ def extract(self):
         """Extract the features or load them from disk if present."""
         store = self.util.get_path("store")
         storage = f"{store}{self.name}.pkl"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False
-        )
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             if not self.model_initialized:
                 self.init_model()
-            self.util.debug(
-                "extracting Hubert embeddings, this might take a while..."
-            )
+            self.util.debug("extracting Hubert embeddings, this might take a while...")
             emb_series = pd.Series(index=self.data_df.index, dtype=object)
             length = len(self.data_df.index)
             for idx, (file, start, end) in enumerate(
@@ -84,9 +83,7 @@ def extract(self):
                 assert sampling_rate == 16000
                 emb = self.get_embeddings(signal, sampling_rate, file)
                 emb_series.iloc[idx] = emb
-            self.df = pd.DataFrame(
-                emb_series.values.tolist(), index=self.data_df.index
-            )
+            self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)
             self.df.to_pickle(storage)
             try:
                 glob_conf.config["DATA"]["needs_feature_extraction"] = "false"

diff --git a/nkululeko/feat_extract/feats_wavlm.py b/nkululeko/feat_extract/feats_wavlm.py
@@ -4,27 +4,32 @@
 
 import os
 
-import nkululeko.glob_conf as glob_conf
 import pandas as pd
 import torch
 import torchaudio
-from nkululeko.feat_extract.featureset import Featureset
 from tqdm import tqdm
-from transformers import Wav2Vec2FeatureExtractor, WavLMModel
+from transformers import Wav2Vec2FeatureExtractor
+from transformers import WavLMModel
+
+from nkululeko.feat_extract.featureset import Featureset
+import nkululeko.glob_conf as glob_conf
 
 
 class Wavlm(Featureset):
-    """Class to extract WavLM embedding)"""
+    """Class to extract WavLM embedding)."""
+
+    def __init__(self, name, data_df, feats_type):
+        """Constructor.
 
-    def __init__(self, name, data_df, feat_type):
-        """Constructor. is_train is needed to distinguish from test/dev sets,
-        because they use the codebook from the training"""
-        super().__init__(name, data_df)
+        Is_train is needed to distinguish from test/dev sets,
+        because they use the codebook from the training.
+        """
+        super().__init__(name, data_df, feats_type)
         # check if device is not set, use cuda if available
         cuda = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = self.util.config_val("MODEL", "device", cuda)
         self.model_initialized = False
-        self.feat_type = feat_type
+        self.feat_type = feats_type
 
     def init_model(self):
         # load model
@@ -59,7 +64,9 @@ def extract(self):
                     frame_offset=int(start.total_seconds() * 16000),
                     num_frames=int((end - start).total_seconds() * 16000),
                 )
-                assert sampling_rate == 16000, f"sampling rate should be 16000 but is {sampling_rate}"
+                assert (
+                    sampling_rate == 16000
+                ), f"sampling rate should be 16000 but is {sampling_rate}"
                 emb = self.get_embeddings(signal, sampling_rate, file)
                 emb_series.iloc[idx] = emb
             self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index)

diff --git a/nkululeko/modelrunner.py b/nkululeko/modelrunner.py
@@ -2,18 +2,16 @@
 
 import pandas as pd
 
-from nkululeko.utils.util import Util
 from nkululeko import glob_conf
-import nkululeko.glob_conf as glob_conf
+from nkululeko.utils.util import Util
 
 
 class Modelrunner:
-    """
-    Class to model one run
-    """
+    """Class to model one run."""
 
     def __init__(self, df_train, df_test, feats_train, feats_test, run):
-        """Constructor setting up the dataframes
+        """Constructor setting up the dataframes.
+
         Args:
             df_train: train dataframe
             df_test: test dataframe