Version 0.3.0
Labbeti committed Apr 18, 2024
1 parent 13cd53e commit 6739981
Showing 113 changed files with 3,494 additions and 6,136 deletions.
36 changes: 36 additions & 0 deletions .flake8
@@ -0,0 +1,36 @@
[flake8]
max-line-length = 88
extend-ignore =
    # Indentation contains tabs
    W191,
    # Whitespace before ':'
    E203,
    # Line too long
    E501,
    # Module level import not at top of file
    E402
show_source = True
exclude =
    # No need to traverse our git directory
    .git,
    # There's no value in checking cache directories
    __pycache__,
    # The conf file is mostly autogenerated, ignore it
    docs/source/conf.py,
    # The old directory contains Flake8 2.0
    old,
    # This contains our built documentation
    build,
    # This contains builds of flake8 that we don't want to check
    dist,
    # Ignore notebook checkpoints
    .ipynb_checkpoints,
    # Ignore data
    data,
    # Ignore logs
    logs,
    # Ignore old legacy files
    legacy
per-file-ignores =
    # imported but unused
    __init__.py: F401
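As a quick way to exercise this new configuration (a sketch, assuming the package sources live under `src` as in the CI workflows below), flake8 can be pointed at the config file explicitly:

```bash
python -m pip install flake8
# lint the sources with the repository's .flake8 settings
python -m flake8 --config=.flake8 src
```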
4 changes: 2 additions & 2 deletions .github/workflows/inference.yaml
@@ -49,9 +49,9 @@ jobs:
- name: Install local packages
run: |
python -m pip install -e .[dev]
python -m pip install -e .[test,dev]
# --- TESTS ---
# --- TESTS ---
- name: Check format with Black
run: |
python -m black --check --diff src
2 changes: 1 addition & 1 deletion .github/workflows/training.yaml
@@ -80,7 +80,7 @@ jobs:
if: ${{ steps.cache_preparation.outputs.cache-hit != 'true' }}
run: |
echo "Prepare data in dataroot '$DATAROOT'"
cnext_bl_path="$HOME/.cache/torch/hub/checkpoints/convnext_tiny_465mAP_BL_AC_70kit.pth"
cnext_bl_path="cnext_bl_75"
conette-prepare data=clotho default=true pann=false pack_to_hdf=true data.clean_archives=true data.subsets=[val] audio_t.src_sr=44100 audio_t.pretrain_path=${cnext_bl_path} post_hdf_name=bl pretag=cnext_bl csum_in_hdf_name=false path.data=$DATAROOT verbose=2
# --- TESTS ---
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ Labbeti/conette/
dist/
logs/
data/
data_tmp*/
35 changes: 35 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,35 @@
# exclude: ""

repos:
  # Format Code
  - repo: https://github.com/ambv/black
    rev: 23.12.1
    hooks:
      - id: black

  # Sort imports
  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0
    hooks:
      - id: isort
        args: ["--profile", "black"]

  # Formatting, Whitespace, etc
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.2.3
    hooks:
      - id: trailing-whitespace
      - id: check-added-large-files
        args: ['--maxkb=2000']
      - id: check-ast
      - id: check-json
      - id: check-merge-conflict
      - id: check-xml
      - id: check-yaml
      - id: debug-statements
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
      - id: mixed-line-ending
        args: ['--fix=no']
      - id: flake8
        args: ['--config=.flake8']
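To make use of this new configuration, the hooks have to be registered once per clone. A minimal sketch with the standard pre-commit CLI:

```bash
python -m pip install pre-commit
# register the git hook described by .pre-commit-config.yaml
pre-commit install
# run every hook once against the whole repository
pre-commit run --all-files
```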
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,16 @@

All notable changes to this project will be documented in this file.

## [0.3.0] 2024-04-18
### Changed
- Update dependencies with `torchoutil`, and clean up a lot of dead code.
- The ConvNeXt model (BL version) is now automatically selected during HDF preparation. ([#5](https://github.com/Labbeti/conette-audio-captioning/issues/5))

### Fixed
- Requirements versions specified during installation. ([#4](https://github.com/Labbeti/conette-audio-captioning/issues/4))
- Preparation now correctly loads `scale_layers` in ConvNeXt. ([#5](https://github.com/Labbeti/conette-audio-captioning/issues/5))
- The ConvNeXt checkpoint trained for 75k iterations is now loaded for CoNeTTE instead of the 70k-iterations one.

## [0.2.2] 2024-01-15
### Added
- Multiple candidates, predictions and probabilities in model outputs.
116 changes: 58 additions & 58 deletions README.md
@@ -9,13 +9,63 @@

</div>

CoNeTTE is an audio captioning system, which generates a short textual description of the sound events in any audio file. The architecture and training are explained in the corresponding [paper](https://arxiv.org/pdf/2309.00454.pdf). The model has been developed by me ([Étienne Labbé](https://labbeti.github.io/)) during my PhD. A simple interface to test CoNeTTE is available on the [HuggingFace website](https://huggingface.co/spaces/Labbeti/conette).
CoNeTTE is an audio captioning system, which generates a short textual description of the sound events in any audio file. The architecture and training are explained in the [corresponding paper](https://arxiv.org/pdf/2309.00454.pdf). The model has been developed by me ([Étienne Labbé](https://labbeti.github.io/)) during my PhD. A simple interface to test CoNeTTE is available on the [HuggingFace website](https://huggingface.co/spaces/Labbeti/conette).

## Inference
## Training
### Requirements
- Intended for Ubuntu 20.04 only. Requires the **java** (< 1.13), **ffmpeg**, **yt-dlp**, and **zip** commands.
- Recommended GPU: NVIDIA V100 with 32GB VRAM.
- The WavCaps dataset might require more than 2 TB of disk storage. The other datasets require less than 50 GB.

### Installation
By default, **only the pip inference requirements are installed for conette**. To install the training requirements, use the following command:
```bash
python -m pip install conette[train]
```
If you have already installed conette for inference, it is **highly recommended to create another environment** before installing conette for training.
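For example, a separate environment can be created with the standard `venv` module before installing the training extras (a sketch; conda or any other environment manager works just as well):

```bash
python -m venv ./env-conette-train
source ./env-conette-train/bin/activate
python -m pip install conette[train]
```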

### Download external models and data
These steps might take a while (a few hours to download and prepare everything, depending on your CPU, GPU and SSD/HDD).

First, download the ConvNeXt, NLTK and spaCy models:
```bash
conette-prepare data=none default=true pack_to_hdf=false csum_in_hdf_name=false pann=false
```

Then download the four datasets used to train CoNeTTE:
```bash
common_args="data.download=true pack_to_hdf=true audio_t=resample_mean_convnext audio_t.pretrain_path=cnext_bl_75 post_hdf_name=bl pretag=cnext_bl_75"

conette-prepare data=audiocaps audio_t.src_sr=32000 ${common_args}
conette-prepare data=clotho audio_t.src_sr=44100 ${common_args}
conette-prepare data=macs audio_t.src_sr=48000 ${common_args}
conette-prepare data=wavcaps audio_t.src_sr=32000 ${common_args} datafilter.min_audio_size=0.1 datafilter.max_audio_size=30.0 datafilter.sr=32000
```

### Train a model
CNext-trans (baseline) on CL only (~3 hours on 1 GPU V100-32G)
```bash
conette-train expt=[clotho_cnext_bl] pl=baseline
```

CoNeTTE on AC+CL+MA+WC, specialized for CL (~4 hours on 1 GPU V100-32G)
```bash
conette-train expt=[camw_cnext_bl_for_c,task_ds_src_camw] pl=conette
```

CoNeTTE on AC+CL+MA+WC, specialized for AC (~3 hours on 1 GPU V100-32G)
```bash
conette-train expt=[camw_cnext_bl_for_a,task_ds_src_camw] pl=conette
```

Note 1: any training using AC data cannot be exactly reproduced because part of this data has been deleted from the YouTube source, and I cannot share my own audio files.
Note 2: paper results are scores averaged over 5 seeds (1234-1238). The default training only uses seed 1234.

## Inference only (without training)

### Installation
```bash
python -m pip install conette
python -m pip install conette[test]
```

### Usage with python
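The Python snippet itself is collapsed in this view. As a rough illustration of the inference API (a sketch that assumes the `CoNeTTEConfig` and `CoNeTTEModel` entry points advertised on the HuggingFace model card), usage looks roughly like:

```python
from conette import CoNeTTEConfig, CoNeTTEModel

# download the released checkpoint from the HuggingFace Hub (assumed entry points)
config = CoNeTTEConfig.from_pretrained("Labbeti/conette")
model = CoNeTTEModel.from_pretrained("Labbeti/conette", config=config)

# caption a single audio file; the model resamples the input internally
path = "/your/path/to/audio.wav"
outputs = model(path)
print(outputs["cands"][0])  # best candidate caption
```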
@@ -71,70 +121,20 @@ The model has been trained on AudioCaps (AC), Clotho (CL), MACS (MA) and WavCaps

| Test data | SPIDEr (%) | SPIDEr-FL (%) | FENSE (%) | Vocab | Outputs | Scores |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
| AC-test | 44.14 | 43.98 | 60.81 | 309 | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/conette/outputs_audiocaps_test.csv) | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/conette/scores_audiocaps_test.yaml) |
| CL-eval | 30.97 | 30.87 | 51.72 | 636 | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/conette/outputs_clotho_eval.csv) | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/conette/scores_clotho_eval.yaml) |
| AC-test | 44.14 | 43.98 | 60.81 | 309 | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/detailed_outputs/outputs_audiocaps_test.csv) | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/detailed_outputs/scores_audiocaps_test.yaml) |
| CL-eval | 30.97 | 30.87 | 51.72 | 636 | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/detailed_outputs/outputs_clotho_eval.csv) | [Link](https://github.com/Labbeti/conette-audio-captioning/blob/main/results/detailed_outputs/scores_clotho_eval.yaml) |

This model checkpoint has been trained with a focus on the Clotho dataset, but it can also reach good performance on AudioCaps with the "audiocaps" task.
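As a hedged illustration, the dataset-specific behaviour can be selected at inference time (reusing the `model` object from the sketch above; the `task` keyword is assumed from the model card, not verified here):

```python
# bias decoding towards AudioCaps-style captions (assumed `task` keyword)
outputs = model("/your/path/to/audio.wav", task="audiocaps")
print(outputs["cands"][0])
```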

### Limitations
- The model expects audio sampled at **32 kHz**. It automatically resamples the input audio files up or down. However, this might give worse results, especially when using audio with lower sampling rates.
- The model has been trained on audio lasting from **1 to 30 seconds**. It can handle longer audio files, but it might require more memory and give worse results.

## Train a model
### Requirements
- Intended for Ubuntu 20.04 only. Requires the **java** (< 1.13), **ffmpeg**, **yt-dlp**, and **zip** commands.
- Recommended GPU: NVIDIA V100 with 32GB VRAM.
- The WavCaps dataset might require more than 2 TB of disk storage. The other datasets require less than 50 GB.

### Installation
By default, **only the inference requirements are installed for conette**. To install the training requirements, use the following command:
```bash
python -m pip install conette[train]
```
If you have already installed conette for inference, it is **highly recommended to create another environment** before installing conette for training.

### Download external models and data
These steps might take a while (a few hours to download and prepare everything, depending on your CPU, GPU and SSD/HDD).

First, download the ConvNeXt, NLTK and spaCy models:
```bash
conette-prepare data=none default=true pack_to_hdf=false csum_in_hdf_name=false pann=false
```

Then download the four datasets used to train CoNeTTE:
```bash
cnext_bl_path="$HOME/.cache/torch/hub/checkpoints/convnext_tiny_465mAP_BL_AC.pth"
common_args="data.download=true pack_to_hdf=true audio_t=resample_mean_convnext audio_t.pretrain_path=${cnext_bl_path} post_hdf_name=bl pretag=cnext_bl"

conette-prepare data=audiocaps audio_t.src_sr=32000 ${common_args}
conette-prepare data=clotho audio_t.src_sr=44100 ${common_args}
conette-prepare data=macs audio_t.src_sr=48000 ${common_args}
conette-prepare data=wavcaps audio_t.src_sr=32000 ${common_args} datafilter.min_audio_size=0.1 datafilter.max_audio_size=30.0 datafilter.sr=32000
```

### Train a model
CNext-trans (baseline) on CL only (~3 hours on 1 GPU V100-32G)
```bash
conette-train expt=[clotho_cnext_bl] pl=baseline
```

CoNeTTE on AC+CL+MA+WC, specialized for CL (~4 hours on 1 GPU V100-32G)
```bash
conette-train expt=[camw_cnext_bl_for_c,task_ds_src_camw] pl=conette
```

CoNeTTE on AC+CL+MA+WC, specialized for AC (~3 hours on 1 GPU V100-32G)
```bash
conette-train expt=[camw_cnext_bl_for_a,task_ds_src_camw] pl=conette
```

**About reproducibility**: any training with AC data cannot be reproduced because part of this data has been deleted from the YouTube source, and I cannot share my own audio files.

## Citation
The preprint version of the paper describing CoNeTTE is available on arxiv: https://arxiv.org/pdf/2309.00454.pdf

```bibtex
@misc{labbé2023conette,
@misc{labbe2023conette,
title = {CoNeTTE: An efficient Audio Captioning system leveraging multiple datasets with Task Embedding},
author = {Étienne Labbé and Thomas Pellegrini and Julien Pinquier},
year = 2023,
@@ -149,8 +149,8 @@ The preprint version of the paper describing CoNeTTE is available on arxiv: http
## Additional information
- CoNeTTE stands for **Co**nv**Ne**Xt-**T**ransformer with **T**ask **E**mbedding.
- Model weights are available on HuggingFace: https://huggingface.co/Labbeti/conette
- The weights of the encoder part of the architecture are based on a ConvNeXt model for audio classification, available here: https://zenodo.org/record/8020843 under the filename "convnext_tiny_465mAP_BL_AC_70kit.pth".
- The weights of the encoder part of the architecture are based on a ConvNeXt model for audio classification, available here: https://zenodo.org/records/10987498 under the filename "convnext_tiny_465mAP_BL_AC_75kit.pth".

## Contact
Maintainer:
- Etienne Labbé "Labbeti": labbeti.pub@gmail.com
- [Étienne Labbé](https://labbeti.github.io/) "Labbeti": labbeti.pub@gmail.com
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -13,7 +13,6 @@ classifiers = [
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
@@ -39,7 +38,7 @@ include = ["conette*"] # package names should match these glob patterns (["*"]
[tool.setuptools.dynamic]
version = {attr = "conette.__version__"}
dependencies = {file = ["requirements.txt"]}
optional-dependencies = { dev = { file = ["requirements-dev.txt"] }, train = { file = ["requirements-train.txt"]}}
optional-dependencies = { dev = { file = ["requirements-dev.txt"] }, train = { file = ["requirements-train.txt"]}, test = { file = []}}

[tool.ruff]
ignore = ["E501", "E402"]
11 changes: 6 additions & 5 deletions requirements-dev.txt
@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-

pytest
flake8
black
ipykernel
twine
black==23.12.1
flake8==6.1.0
ipykernel==6.27.1
pre-commit==3.7.0
pytest==7.4.3
twine==4.0.1