From b9b802a32ec525864a90784b2c1fe1a9f67834b3 Mon Sep 17 00:00:00 2001 From: bknota Date: Tue, 27 Feb 2024 01:20:01 +0000 Subject: [PATCH 1/5] add readme; modify commands --- .gitignore | 5 ++-- Dockerfile | 2 +- README.md | 75 +++++++++++++++++++++++++++++++++++++++++++++- app.py | 2 +- app.sh | 4 +++ docker-compose.yml | 11 ++----- download.sh | 9 +++++- inference.py | 2 +- inference.sh | 20 ++++++++++--- preprocess.sh | 7 ++++- requirements.txt | 2 +- 11 files changed, 117 insertions(+), 22 deletions(-) create mode 100644 app.sh diff --git a/.gitignore b/.gitignore index 67ac225..e56eeef 100644 --- a/.gitignore +++ b/.gitignore @@ -19,8 +19,7 @@ __pycache__ *.gz *.json -results/ +results* temp/ sample* - -app.sh \ No newline at end of file +data/lrs3_v0.4_txt/lrs3_v0.4/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 3cc3c5e..3d78479 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,5 +3,5 @@ FROM nvcr.io/nvidia/pytorch:22.03-py3 ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update RUN apt-get install ffmpeg libsm6 libxext6 tmux git -y - +RUN pip install --no-cache -r requirements.txt WORKDIR /workspace diff --git a/README.md b/README.md index 02a2a69..f9b4260 100644 --- a/README.md +++ b/README.md @@ -10,4 +10,77 @@ pinned: true license: apache-2.0 --- -# README here \ No newline at end of file +# 28× Compressed Wav2Lip by Nota AI + +Official codebase for [**Accelerating Speech-Driven Talking Face Generation with 28× Compressed Wav2Lip**](https://arxiv.org/abs/2304.00471). + +- Presented at [ICCV'23 Demo](https://iccv2023.thecvf.com/demos-111.php) Track; [On-Device Intelligence Workshop](https://sites.google.com/g.harvard.edu/on-device-workshop-23/home) @ MLSys'23; [NVIDIA GTC 2023](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=52409&sort=relevance&sortDir=desc) Poster. 
+ + +## Installation +#### Docker (recommended) +```bash +docker compose run --service-ports --name nota-compressed-wav2lip compressed-wav2lip bash +``` + +#### Conda +
+Click + +```bash +conda create -n nota-wav2lip python=3.9 +conda activate nota-wav2lip +git clone https://github.com/Nota-NetsPresso/nota-wav2lip.git +cd nota-wav2lip +pip install -r requirements.txt +apt-get update +apt-get install ffmpeg libsm6 libxext6 tmux git -y +``` +
+ +## Gradio Demo +Use the script below to run the [nota-ai/compressed-wav2lip demo](https://huggingface.co/spaces/nota-ai/compressed-wav2lip). The models and sample data will be downloaded automatically. + + ```bash + bash app.sh + ``` + +## Inference +(1) Download the YouTube videos listed in the LRS3-TED label text files and preprocess them. + - Download `lrs3_v0.4_txt.zip` from [this link](https://mmai.io/datasets/lip_reading/). + - Unzip the file so that the folder structure is: `./data/lrs3_v0.4_txt/lrs3_v0.4/test` + - Run `bash download.sh` + - Run `bash preprocess.sh` + +(2) Run the script to compare the original Wav2Lip with Nota's compressed version. + + ```bash + bash inference.sh + ``` + +## License +The purpose of this repository, along with its model weights, is strictly for research and non-commercial projects. + +## Contact +- To obtain compression code and assistance, kindly contact Nota AI. These are provided as part of our business solutions (for business inquiries: contact@nota.ai). +- For Q&A about this repo, use this board: [Nota-NetsPresso/discussions](https://github.com/orgs/Nota-NetsPresso/discussions) + +## Acknowledgment + - [NVIDIA Applied Research Accelerator Program](https://www.nvidia.com/en-us/industries/higher-education-research/applied-research-program/) for supporting this research. + - [Wav2Lip](https://github.com/Rudrabha/Wav2Lip) and [LRS3-TED](https://www.robots.ox.ac.uk/~vgg/data/lip_reading/) for facilitating the development of the original Wav2Lip. 
+ +## Citation +```bibtex +@article{kim2023unified, + title={A Unified Compression Framework for Efficient Speech-Driven Talking-Face Generation}, + author={Kim, Bo-Kyeong and Kang, Jaemin and Seo, Daeun and Park, Hancheol and Choi, Shinkook and Song, Hyoung-Kyu and Kim, Hyungshin and Lim, Sungsu}, + journal={MLSys Workshop on On-Device Intelligence (ODIW)}, + year={2023}, + url={https://arxiv.org/abs/2304.00471} +} +``` + + + + + diff --git a/app.py b/app.py index 517f54e..1ecf536 100644 --- a/app.py +++ b/app.py @@ -102,4 +102,4 @@ gr.Markdown(Path('docs/footer.md').read_text()) - demo.queue().launch() + demo.queue().launch(share=True) diff --git a/app.sh b/app.sh new file mode 100644 index 0000000..be817bb --- /dev/null +++ b/app.sh @@ -0,0 +1,4 @@ +export LRS_ORIGINAL_URL=https://netspresso-huggingface-demo-checkpoint.s3.us-east-2.amazonaws.com/compressed-wav2lip/lrs3-wav2lip.pth && \ +export LRS_COMPRESSED_URL=https://netspresso-huggingface-demo-checkpoint.s3.us-east-2.amazonaws.com/compressed-wav2lip/lrs3-nota-wav2lip.pth && \ +export LRS_INFERENCE_SAMPLE=https://netspresso-huggingface-demo-checkpoint.s3.us-east-2.amazonaws.com/data/compressed-wav2lip-inference/sample.tar.gz && \ +python app.py \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 8935901..c0820a0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,16 +1,11 @@ version: "3.9" - -# docker compose run --service-ports --name compressed-wav2lip-hksong compressed-wav2lip bash - services: compressed-wav2lip: - image: compressed-wav2lip:dev + image: nota-compressed-wav2lip:dev build: ./ - container_name: efficient-wav2lip-hksong + container_name: nota-compressed-wav2lip ipc: host ports: - "7860:7860" volumes: - - /data2/hksong/compressed-wav2lip:/workspace - - /data2/hksong/DATA:/DATA - - /data2/hksong/LOG:/LOG + - ./:/workspace \ No newline at end of file diff --git a/download.sh b/download.sh index ace8174..c282ce3 100644 --- a/download.sh +++ 
b/download.sh @@ -1,2 +1,9 @@ +# example for audio source python download.py\ - -i 00003.txt \ No newline at end of file + -i data/lrs3_v0.4_txt/lrs3_v0.4/test/sxnlvwprfSc/00007.txt + +# example for video source +python download.py\ + -i data/lrs3_v0.4_txt/lrs3_v0.4/test/Li4S1yyrsTI/00010.txt + + \ No newline at end of file diff --git a/inference.py b/inference.py index a2eaaf7..d204231 100644 --- a/inference.py +++ b/inference.py @@ -47,7 +47,7 @@ def parse_args(): '--model', choices=['wav2lip', 'nota_wav2lip'], default='nota_wav2ilp', - help="Model for generating talking video. Defaults: wav2lip" + help="Model for generating talking video. Defaults: nota_wav2lip" ) parser.add_argument( diff --git a/inference.sh b/inference.sh index d7c9ff0..f139104 100644 --- a/inference.sh +++ b/inference.sh @@ -1,6 +1,18 @@ + +# Original Wav2Lip python inference.py\ - -a "sample/1673_orig.wav"\ - -v "sample_video_lrs3/EV3OmxrowWE-00003"\ + -a "sample_video_lrs3/sxnlvwprf_c-00007.wav"\ + -v "sample_video_lrs3/Li4-1yyrsTI-00010"\ + -m "wav2lip"\ + -o "result_original"\ + --device cpu + +# Nota's Wav2Lip (28× Compressed) +python inference.py\ + -a "sample_video_lrs3/sxnlvwprf_c-00007.wav"\ + -v "sample_video_lrs3/Li4-1yyrsTI-00010"\ -m "nota_wav2lip"\ - -o "result"\ - --device cpu \ No newline at end of file + -o "result_nota"\ + --device cpu + + \ No newline at end of file diff --git a/preprocess.sh b/preprocess.sh index e9c9d41..df4a52f 100644 --- a/preprocess.sh +++ b/preprocess.sh @@ -1,2 +1,7 @@ +# example for audio source python preprocess.py\ - -i sample_video_lrs3/EV3OmxrowWE-00003.mp4 \ No newline at end of file + -i sample_video_lrs3/sxnlvwprf_c-00007.mp4 + +# example for video source +python preprocess.py\ + -i sample_video_lrs3/Li4-1yyrsTI-00010.mp4 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2c73619..5dcd1bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ tqdm==4.63.0 lws==1.2.7 omegaconf==2.3.0 
yt-dlp==2022.6.22 -loguru==0.7.2 \ No newline at end of file +loguru==0.7.2 From 41ccc330d43eddf2fed0f30e81ccf3aee4758c6d Mon Sep 17 00:00:00 2001 From: bknota Date: Tue, 27 Feb 2024 16:09:28 +0000 Subject: [PATCH 2/5] revise documentation --- Dockerfile | 4 +++- README.md | 20 +++++++++----------- download.py | 2 -- download.sh | 4 +--- inference.sh | 5 +---- requirements.txt | 2 +- 6 files changed, 15 insertions(+), 22 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3d78479..4fc7fdd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,5 +3,7 @@ FROM nvcr.io/nvidia/pytorch:22.03-py3 ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update RUN apt-get install ffmpeg libsm6 libxext6 tmux git -y -RUN pip install --no-cache -r requirements.txt + WORKDIR /workspace +COPY requirements.txt . +RUN pip install --no-cache -r requirements.txt \ No newline at end of file diff --git a/README.md b/README.md index f9b4260..3952366 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ Official codebase for [**Accelerating Speech-Driven Talking Face Generation with ## Installation #### Docker (recommended) ```bash +git clone https://github.com/Nota-NetsPresso/nota-wav2lip.git +cd nota-wav2lip docker compose run --service-ports --name nota-compressed-wav2lip compressed-wav2lip bash ``` @@ -28,13 +30,13 @@ docker compose run --service-ports --name nota-compressed-wav2lip compressed-wav Click ```bash -conda create -n nota-wav2lip python=3.9 -conda activate nota-wav2lip git clone https://github.com/Nota-NetsPresso/nota-wav2lip.git cd nota-wav2lip -pip install -r requirements.txt apt-get update apt-get install ffmpeg libsm6 libxext6 tmux git -y +conda create -n nota-wav2lip python=3.9 +conda activate nota-wav2lip +pip install -r requirements.txt ``` @@ -59,10 +61,11 @@ Use the below script to run the [nota-ai/compressed-wav2lip demo](https://huggin ``` ## License -The purpose of this repository, along with its model weights, is strictly for research and non-commercial projects. 
+- All rights related to this repository and the compressed models are reserved by Nota Inc. +- The intended use is strictly limited to research and non-commercial projects. ## Contact -- To obtain compression code and assistance, kindly contact Nota AI. These are provided as part of our business solutions (for business inquiries: contact@nota.ai). +- To obtain compression code and assistance, kindly contact Nota AI (contact@nota.ai). These are provided as part of our business solutions. - For Q&A about this repo, use this board: [Nota-NetsPresso/discussions](https://github.com/orgs/Nota-NetsPresso/discussions) ## Acknowledgment @@ -78,9 +81,4 @@ The purpose of this repository, along with its model weights, is strictly for re year={2023}, url={https://arxiv.org/abs/2304.00471} } -``` - - - - - +``` \ No newline at end of file diff --git a/download.py b/download.py index 536638d..ee4d663 100644 --- a/download.py +++ b/download.py @@ -1,8 +1,6 @@ import argparse - from nota_wav2lip.preprocess import get_cropped_face_from_lrs3_label - def parse_args(): parser = argparse.ArgumentParser(description="NotaWav2Lip: Get LRS3 video sample with the label text file") diff --git a/download.sh b/download.sh index c282ce3..4671365 100644 --- a/download.sh +++ b/download.sh @@ -4,6 +4,4 @@ python download.py\ # example for video source python download.py\ - -i data/lrs3_v0.4_txt/lrs3_v0.4/test/Li4S1yyrsTI/00010.txt - - \ No newline at end of file + -i data/lrs3_v0.4_txt/lrs3_v0.4/test/Li4S1yyrsTI/00010.txt \ No newline at end of file diff --git a/inference.sh b/inference.sh index f139104..64cf045 100644 --- a/inference.sh +++ b/inference.sh @@ -1,4 +1,3 @@ - # Original Wav2Lip python inference.py\ -a "sample_video_lrs3/sxnlvwprf_c-00007.wav"\ @@ -13,6 +12,4 @@ python inference.py\ -v "sample_video_lrs3/Li4-1yyrsTI-00010"\ -m "nota_wav2lip"\ -o "result_nota"\ - --device cpu - - \ No newline at end of file + --device cpu \ No newline at end of file diff --git a/requirements.txt 
b/requirements.txt index 5dcd1bf..2c73619 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ tqdm==4.63.0 lws==1.2.7 omegaconf==2.3.0 yt-dlp==2022.6.22 -loguru==0.7.2 +loguru==0.7.2 \ No newline at end of file From a08156ea07497853ac4a4643b9b59a2e80c909d4 Mon Sep 17 00:00:00 2001 From: bknota Date: Tue, 27 Feb 2024 16:13:46 +0000 Subject: [PATCH 3/5] add empty data folder (placeholder for lrs3_v0.4) --- data/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/.gitkeep diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 From c86362282aab4521d08e11c7cfbfaeaa4d8cda28 Mon Sep 17 00:00:00 2001 From: Hyoung-Kyu Song Date: Thu, 7 Mar 2024 18:23:16 +0900 Subject: [PATCH 4/5] ruff fix --- download.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/download.py b/download.py index ee4d663..536638d 100644 --- a/download.py +++ b/download.py @@ -1,6 +1,8 @@ import argparse + from nota_wav2lip.preprocess import get_cropped_face_from_lrs3_label + def parse_args(): parser = argparse.ArgumentParser(description="NotaWav2Lip: Get LRS3 video sample with the label text file") From 92bf82926c764f5e105f6fa3ecda36215896ddce Mon Sep 17 00:00:00 2001 From: Hyoung-Kyu Song Date: Thu, 7 Mar 2024 18:24:06 +0900 Subject: [PATCH 5/5] disable share server --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index 1ecf536..517f54e 100644 --- a/app.py +++ b/app.py @@ -102,4 +102,4 @@ gr.Markdown(Path('docs/footer.md').read_text()) - demo.queue().launch(share=True) + demo.queue().launch()