diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml new file mode 100644 index 0000000..e31d81c --- /dev/null +++ b/.github/workflows/jekyll-gh-pages.yml @@ -0,0 +1,51 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Deploy Jekyll with GitHub Pages dependencies preinstalled + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./ + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6403698 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +work_dirs/ +__pycache__/ +**.pyc +**.pth +data/ +data +checkpoints/ +envs/ +visualization/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..9f0867b --- /dev/null +++ b/README.md @@ -0,0 +1,239 @@ +
+ +![logo](https://github.com/user-attachments/assets/f9351412-d54a-4ac6-9344-d412fe3b3581) + +# Cloud-Adapter + +Cloud Segmentation for Remote Sensing Images. + +

+ + + Python + + PyTorch + + Lightning + + + Config: hydra + + Code style: black + +

+ +

+ View Demo + • + Report Bug + • + Request Feature +

+ + + +
+ +# Introduction + +This repository serves as the official implementation of the paper **"Adapting Vision Foundation Models for Robust Cloud Segmentation in Remote Sensing Images"**. It provides a comprehensive pipeline for semantic segmentation, including data preprocessing, model training, evaluation, and deployment, specifically tailored for cloud segmentation tasks in remote sensing imagery. + +--- + + +## Quick Start + +### 1. Clone the Repository + +```bash +git clone https://github.com/XavierJiezou/Cloud-Adapter.git +cd Cloud-Adapter +``` + +### 2. Install Dependencies + +You can either set up the environment manually or use our pre-configured environment for convenience: + +#### Option 1: Manual Installation + +Ensure you are using Python 3.8 or higher, then install the required dependencies: + +```bash +pip install -r requirements.txt +``` + +#### Option 2: Use Pre-configured Environment + +We provide a pre-configured environment (`envs`) hosted on Hugging Face. You can download it directly from [Hugging Face](https://huggingface.co/XavierJiezou/cloud-adapter-models). Follow the instructions on the page to set up and activate the environment. + +--- + +### 3. Prepare Data + +We have open-sourced all datasets used in the paper, which are hosted on [Hugging Face Datasets](https://huggingface.co/datasets/XavierJiezou/cloud-adapter-datasets). Please follow the instructions on the dataset page to download the data. + +After downloading, organize the dataset as follows: + +``` +Cloud-Adapter +├── ... +├── data +│ ├── cloudsen12_high_l1c +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ ├── cloudsen12_high_l2a +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ ├── gf12ms_whu_gf1 +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ ├── gf12ms_whu_gf2 +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ ├── hrc_whu +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +├── ... +``` + +### 4. Model Weights + +All model weights used in the paper have been open-sourced and are available on [Hugging Face Models](https://huggingface.co/XavierJiezou/cloud-adapter-models). You can download the pretrained models and directly integrate them into your pipeline. + +To use a pretrained model, specify the path to the downloaded weights in your configuration file or command-line arguments. + +--- + +### 5. Train the Model + +We utilize the [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) framework for training. Please ensure you have the MMSegmentation library installed and the configuration file properly set up. + +#### Step 1: Modify the Configuration File + +Update the `configs` directory with your training configuration, or use one of the provided example configurations. You can customize the backbone, dataset paths, and hyperparameters in the configuration file (e.g., `configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py`). 
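Below is a minimal sketch of what such a customization could look like. It assumes the standard MMSegmentation/MMEngine config-inheritance mechanism (`_base_` plus dict-merge overrides); the file name, dataset root, checkpoint path, and schedule values are illustrative placeholders, and the exact keys available depend on the base config shipped in this repository, so check the referenced config before copying these overrides.

```python
# configs/adapter/my_cloud_adapter_experiment.py  (hypothetical file name)
# Minimal override sketch assuming MMSegmentation 1.x config inheritance;
# adjust the keys to match the fields actually defined in the base config.
_base_ = ["./cloud_adapter_pmaa_convnext_lora_16_adapter_all.py"]

# Point the dataloaders at a local dataset prepared in Step 3 (placeholder path).
data_root = "data/hrc_whu"
train_dataloader = dict(dataset=dict(data_root=data_root))
val_dataloader = dict(dataset=dict(data_root=data_root))
test_dataloader = dict(dataset=dict(data_root=data_root))

# Start from downloaded pretrained weights (placeholder path).
load_from = "checkpoints/cloud_adapter_pretrained.pth"

# Example hyperparameter overrides for a shorter sanity-check run.
train_cfg = dict(max_iters=8000, val_interval=1000)
optim_wrapper = dict(optimizer=dict(lr=1e-4))
```

You would then pass this file to `tools/train.py` in Step 2 in place of the example config.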
+ +#### Step 2: Start Training + +Use the following command to begin training: + +```bash +CUDA_VISIBLE_DEVICES=0 python tools/train.py configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py +``` + +#### Step 3: Resume or Fine-tune + +To resume training from a checkpoint or fine-tune using pretrained weights, run: + +```bash +python tools/train.py configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py --resume-from path/to/checkpoint.pth +``` + +### 6. Evaluate the Model + +Use the following command to evaluate the trained model: + +```bash +CUDA_VISIBLE_DEVICES=0 python tools/test.py configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py path/to/checkpoint.pth +``` + +#### Special Evaluation: L8_Biome Dataset + +If you want to evaluate the model’s performance on different scenes of the **L8_Biome** dataset, you can run the following script: + +```bash +python tools/eval_l8_scene.py --config configs/to/path.py --checkpoint path/to/checkpoint.pth --img_dir data/l8_biome +``` + +This will automatically evaluate the model across various scenes of the **L8_Biome** dataset, providing detailed performance metrics for each scene. + + +#### Reproducing Paper Comparisons + +If you would like to reproduce the other models and comparisons presented in the paper, please refer to our other repository: [CloudSeg](https://github.com/XavierJiezou/cloudseg). This repository contains the implementation and weights of the other models used for comparison in the study. + +--- + +### 7. Gradio Demo + +We have created a **Gradio** demo to showcase the model's functionality. If you'd like to try it out, follow these steps: + +1. Navigate to the `hugging_face` directory: + +```bash +cd hugging_face +``` + +2. Run the demo: + +```bash +python app.py +``` + +This will start the Gradio interface, where you can upload remote sensing images and visualize the model's segmentation results in real-time. + +#### Troubleshooting + +- If you encounter a `file not found` error, it is likely that the model weights have not been downloaded. Please visit [Hugging Face Models](https://huggingface.co/XavierJiezou/cloud-adapter-models) to download the pretrained model weights. + +- **GPU Requirements**: To run the model on a GPU, you will need at least **16GB** of GPU memory. + +- **Running on CPU**: If you prefer to run the demo on CPU instead of GPU, set the following environment variable before running the demo: + +```bash +export CUDA_VISIBLE_DEVICES=-1 +``` + +If you prefer to run inference directly from Python rather than through the demo, see the minimal sketch at the end of this README. + +## Citation + +If you use our code or models in your research, please cite: + +```bibtex +@article{cloud-adapter, + title={Adapting Vision Foundation Models for Robust Cloud Segmentation in Remote Sensing Images}, + author={Xuechao Zou and Shun Zhang and Kai Li and Shiying Wang and Junliang Xing and Lei Jin and Congyan Lang and Pin Tao}, + year={2024}, + eprint={2411.13127}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2411.13127} +} +``` + +## License + +This project is licensed under the [Apache License 2.0](LICENSE).
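## Programmatic Inference (Sketch)

The commands above go through `tools/test.py` and the Gradio demo. If you only want a quick programmatic check of a trained checkpoint, the following minimal sketch shows one way to do it with the MMSegmentation 1.x Python API. It assumes the `cloud_adapter` package is importable so that its custom backbones and datasets get registered; the config, checkpoint, and image paths are placeholders, so treat this as an illustration rather than an officially supported entry point.

```python
# inference_sketch.py -- illustrative only; all paths below are placeholders.
import cloud_adapter  # noqa: F401  (importing registers the custom modules with mmseg)
from mmseg.apis import init_model, inference_model, show_result_pyplot

config = "configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py"
checkpoint = "path/to/checkpoint.pth"
image = "path/to/remote_sensing_image.png"

# Build the model and run single-image inference (use device="cpu" if no GPU is available).
model = init_model(config, checkpoint, device="cuda:0")
result = inference_model(model, image)

# Save a colorized prediction instead of opening a window.
show_result_pyplot(model, image, result, show=False, out_file="prediction.png")
```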
diff --git a/cloud_adapter/__init__.py b/cloud_adapter/__init__.py new file mode 100644 index 0000000..eee1970 --- /dev/null +++ b/cloud_adapter/__init__.py @@ -0,0 +1,6 @@ +from .dg_metrics import DGIoUMetric +from .models import * +from .optimizers import * +from .hooks import * +from .utils import * +from .datasets import * \ No newline at end of file diff --git a/cloud_adapter/datasets/__init__.py b/cloud_adapter/datasets/__init__.py new file mode 100644 index 0000000..1507340 --- /dev/null +++ b/cloud_adapter/datasets/__init__.py @@ -0,0 +1,15 @@ +from .hrc_whu import HRCWHUDataset +from .gf12ms_whu_gf1 import GF12MSWHUGF1Dataset +from .gf12ms_whu_gf2 import GF12MSWHUGF2Dataset +from .cloudsen12_high_l1c import CLOUDSEN12HIGHL1CDataset +from .cloudsen12_high_l2a import CLOUDSEN12HIGHL2ADataset +from .l8_biome import L8BIOMEDataset + +__all__ = [ + "HRCWHUDataset", + "GF12MSWHUGF1Dataset", + "GF12MSWHUGF2Dataset", + "CLOUDSEN12HIGHL1CDataset", + "CLOUDSEN12HIGHL2ADataset", + "L8BIOMEDataset", +] diff --git a/cloud_adapter/datasets/cloudsen12_high_l1c.py b/cloud_adapter/datasets/cloudsen12_high_l1c.py new file mode 100644 index 0000000..90c0621 --- /dev/null +++ b/cloud_adapter/datasets/cloudsen12_high_l1c.py @@ -0,0 +1,29 @@ +from mmseg.registry import DATASETS +from mmseg.datasets import BaseSegDataset + + +@DATASETS.register_module() +class CLOUDSEN12HIGHL1CDataset(BaseSegDataset): + METAINFO = dict( + classes=("clear", "thick cloud", "thin cloud", "cloud shadow"), + palette=[ + [0, 0, 0], + [255, 255, 255], + [170, 170, 170], + [85, 85, 85], + ], + ) + + def __init__( + self, + img_suffix=".png", + seg_map_suffix=".png", + reduce_zero_label=False, + **kwargs + ) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs + ) diff --git a/cloud_adapter/datasets/cloudsen12_high_l2a.py b/cloud_adapter/datasets/cloudsen12_high_l2a.py new file mode 100644 index 0000000..bea8e91 --- /dev/null +++ b/cloud_adapter/datasets/cloudsen12_high_l2a.py @@ -0,0 +1,29 @@ +from mmseg.registry import DATASETS +from mmseg.datasets import BaseSegDataset + + +@DATASETS.register_module() +class CLOUDSEN12HIGHL2ADataset(BaseSegDataset): + METAINFO = dict( + classes=("clear", "thick cloud", "thin cloud", "cloud shadow"), + palette=[ + [0, 0, 0], + [255, 255, 255], + [170, 170, 170], + [85, 85, 85], + ], + ) + + def __init__( + self, + img_suffix=".png", + seg_map_suffix=".png", + reduce_zero_label=False, + **kwargs + ) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs + ) diff --git a/cloud_adapter/datasets/gf12ms_whu_gf1.py b/cloud_adapter/datasets/gf12ms_whu_gf1.py new file mode 100644 index 0000000..3874b4d --- /dev/null +++ b/cloud_adapter/datasets/gf12ms_whu_gf1.py @@ -0,0 +1,20 @@ +from mmseg.registry import DATASETS +from mmseg.datasets import BaseSegDataset + + +@DATASETS.register_module() +class GF12MSWHUGF1Dataset(BaseSegDataset): + METAINFO = dict( + classes=('clear sky', 'cloud'), + palette=[[0, 0, 0], [255, 255, 255]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) \ No newline at end of file diff --git a/cloud_adapter/datasets/gf12ms_whu_gf2.py b/cloud_adapter/datasets/gf12ms_whu_gf2.py new file mode 100644 index 0000000..3777104 --- 
/dev/null +++ b/cloud_adapter/datasets/gf12ms_whu_gf2.py @@ -0,0 +1,20 @@ +from mmseg.registry import DATASETS +from mmseg.datasets import BaseSegDataset + + +@DATASETS.register_module() +class GF12MSWHUGF2Dataset(BaseSegDataset): + METAINFO = dict( + classes=('clear sky', 'cloud'), + palette=[[0, 0, 0], [255, 255, 255]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) \ No newline at end of file diff --git a/cloud_adapter/datasets/hrc_whu.py b/cloud_adapter/datasets/hrc_whu.py new file mode 100644 index 0000000..c782e13 --- /dev/null +++ b/cloud_adapter/datasets/hrc_whu.py @@ -0,0 +1,20 @@ +from mmseg.registry import DATASETS +from mmseg.datasets import BaseSegDataset + + +@DATASETS.register_module() +class HRCWHUDataset(BaseSegDataset): + METAINFO = dict( + classes=('clear sky', 'cloud'), + palette=[[0, 0, 0],[255, 255, 255]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) \ No newline at end of file diff --git a/cloud_adapter/datasets/l8_biome.py b/cloud_adapter/datasets/l8_biome.py new file mode 100644 index 0000000..77474f0 --- /dev/null +++ b/cloud_adapter/datasets/l8_biome.py @@ -0,0 +1,29 @@ +from mmseg.registry import DATASETS +from mmseg.datasets import BaseSegDataset + + +@DATASETS.register_module() +class L8BIOMEDataset(BaseSegDataset): + METAINFO = dict( + classes=("Clear", "Cloud Shadow", "Thin Cloud", "Cloud"), + palette=[ + [0, 0, 0], + [85, 85, 85], + [170, 170, 170], + [255, 255, 255], + ], + ) + + def __init__( + self, + img_suffix=".png", + seg_map_suffix=".png", + reduce_zero_label=False, + **kwargs + ) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs + ) diff --git a/cloud_adapter/dg_metrics.py b/cloud_adapter/dg_metrics.py new file mode 100644 index 0000000..b0f59db --- /dev/null +++ b/cloud_adapter/dg_metrics.py @@ -0,0 +1,89 @@ +import os.path as osp +from typing import Dict, Sequence + +import numpy as np +from mmengine.logging import MMLogger, print_log +from PIL import Image + +from mmseg.registry import METRICS +from mmseg.evaluation.metrics.iou_metric import IoUMetric +from collections import defaultdict + + +@METRICS.register_module() +class DGIoUMetric(IoUMetric): + def __init__(self, dataset_keys=[], mean_used_keys=[], **kwargs): + super().__init__(**kwargs) + self.dataset_keys = dataset_keys + if mean_used_keys: + self.mean_used_keys = mean_used_keys + else: + self.mean_used_keys = dataset_keys + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data and data_samples. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. 
+ """ + num_classes = len(self.dataset_meta["classes"]) + for data_sample in data_samples: + pred_label = data_sample["pred_sem_seg"]["data"].squeeze() + # format_only always for test dataset without ground truth + if not self.format_only: + label = data_sample["gt_sem_seg"]["data"].squeeze().to(pred_label) + res1, res2, res3, res4 = self.intersect_and_union( + pred_label, label, num_classes, self.ignore_index + ) + dataset_key = "unknown" + for key in self.dataset_keys: + if key in data_samples[0]["seg_map_path"]: + dataset_key = key + break + self.results.append([dataset_key, res1, res2, res3, res4]) + # format_result + if self.output_dir is not None: + basename = osp.splitext(osp.basename(data_sample["img_path"]))[0] + png_filename = osp.abspath(osp.join(self.output_dir, f"{basename}.png")) + output_mask = pred_label.cpu().numpy() + # The index range of official ADE20k dataset is from 0 to 150. + # But the index range of output is from 0 to 149. + # That is because we set reduce_zero_label=True. + if data_sample.get("reduce_zero_label", False): + output_mask = output_mask + 1 + output = Image.fromarray(output_mask.astype(np.uint8)) + output.save(png_filename) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. The key + mainly includes aAcc, mIoU, mAcc, mDice, mFscore, mPrecision, + mRecall. + """ + dataset_results = defaultdict(list) + metrics = {} + for result in results: + dataset_results[result[0]].append(result[1:]) + metrics_type2mean = defaultdict(list) + for key, key_result in dataset_results.items(): + logger: MMLogger = MMLogger.get_current_instance() + print_log(f"----------metrics for {key}------------", logger) + key_metrics = super().compute_metrics(key_result) + print_log(f"number of samples for {key}: {len(key_result)}") + for k, v in key_metrics.items(): + metrics[f"{key}_{k}"] = v + if key in self.mean_used_keys: + metrics_type2mean[k].append(v) + for k, v in metrics_type2mean.items(): + metrics[f"mean_{k}"] = sum(v) / len(v) + return metrics diff --git a/cloud_adapter/hooks/__init__.py b/cloud_adapter/hooks/__init__.py new file mode 100644 index 0000000..0df5090 --- /dev/null +++ b/cloud_adapter/hooks/__init__.py @@ -0,0 +1 @@ +from .load_backbone_hook import LoadBackboneHook diff --git a/cloud_adapter/hooks/load_backbone_hook.py b/cloud_adapter/hooks/load_backbone_hook.py new file mode 100644 index 0000000..9708620 --- /dev/null +++ b/cloud_adapter/hooks/load_backbone_hook.py @@ -0,0 +1,23 @@ +from mmseg.registry import HOOKS +from mmengine.hooks import Hook +from mmengine.runner.checkpoint import _load_checkpoint +def load_backbone(checkpoint,backbone_weight_path): + converted_backbone_weight = _load_checkpoint( + backbone_weight_path, map_location="cpu" + ) + if "state_dict" in checkpoint: + checkpoint["state_dict"].update( + {f"backbone.{k}": v for k, v in converted_backbone_weight.items()} + ) + else: + checkpoint.update( + {f"backbone.{k}": v for k, v in converted_backbone_weight.items()} + ) + +@HOOKS.register_module() +class LoadBackboneHook(Hook): + def __init__(self, checkpoint_path) -> None: + self.checkpoint_path = checkpoint_path + + def after_load_checkpoint(self, runner, checkpoint: dict) -> None: + load_backbone(checkpoint,self.checkpoint_path) diff --git a/cloud_adapter/models/__init__.py 
b/cloud_adapter/models/__init__.py new file mode 100644 index 0000000..8e3fb98 --- /dev/null +++ b/cloud_adapter/models/__init__.py @@ -0,0 +1,2 @@ +from .backbones import * +from .segmentors import * diff --git a/cloud_adapter/models/backbones/__init__.py b/cloud_adapter/models/backbones/__init__.py new file mode 100644 index 0000000..9297ca1 --- /dev/null +++ b/cloud_adapter/models/backbones/__init__.py @@ -0,0 +1,28 @@ +from .dino_v2 import DinoVisionTransformer +from .reins_dinov2 import ReinsDinoVisionTransformer +from .reins_eva_02 import ReinsEVA2 +from .reins_resnet import ReinsResNetV1c +from .my_rein_dinov2 import MyReinsDinoVisionTransformer +from .myreinstoken import MyReinsToken +from .rein_token_divo2 import ReinsTokenDinoVisionTransformer +from .myrein_tonken_mlp import MyReinsTokenMlp +from .my_rein_token_mlp_dinov2 import MyReinTokenDinoVisionTransformer +from .loracacheadapter import LoRACacheAdapter +from .vitadapter_dinov2 import ViTAdapter +from .cnnadapter import CNNAdapter +from .cnnadapter_dinov2 import CNNAdapterDinoVisionTransformer +from .pmaaadapter import PMAAAdapter +from .pmaaadapter_dinov2 import PMAAAdapterDinoVisionTransformer +from .cloud_adapter import CloudAdapter +from .cloud_adapter_dinov2 import CloudAdapterDinoVisionTransformer +try: + from .reins_convnext import ReinsConvNeXt +except: + print('Fail to import ReinsConvNeXt, if you need to use it, please install mmpretrain') +from .clip import CLIPVisionTransformer +from .reins_sam_vit import ReinsSAMViT +from .sam_vit import SAMViT +from .reins_clip import ReinsCLIPVisionTransformer +from .convnext_dinov2 import ConvnextDinoVisionTransformer +from .loracacheadapter_dinov2 import LoRACacheAdapterDinoVisionTransformer +from .cloud_adapter_sam import CloudAdapterSamVisionTransformer \ No newline at end of file diff --git a/cloud_adapter/models/backbones/adapter_modules.py b/cloud_adapter/models/backbones/adapter_modules.py new file mode 100644 index 0000000..e64ff82 --- /dev/null +++ b/cloud_adapter/models/backbones/adapter_modules.py @@ -0,0 +1,298 @@ +import logging +from functools import partial + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +# from ops.modules import MSDeformAttn +from mmcv.ops import MultiScaleDeformableAttention as MSDeformAttn +from timm.models.layers import DropPath + +_logger = logging.getLogger(__name__) + + +def get_reference_points(spatial_shapes, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / H_ + ref_x = ref_x.reshape(-1)[None] / W_ + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] + return reference_points + + +def deform_inputs(x): + bs, c, h, w = x.shape + spatial_shapes = torch.as_tensor([(h // 8, w // 8), + (h // 16, w // 16), + (h // 32, w // 32)], + dtype=torch.long, device=x.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // 16, w // 16)], x.device) + deform_inputs1 = [reference_points, spatial_shapes, level_start_index] + + spatial_shapes = torch.as_tensor([(h // 16, w // 16)], dtype=torch.long, device=x.device) + level_start_index = 
torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = get_reference_points([(h // 8, w // 8), + (h // 16, w // 16), + (h // 32, w // 32)], x.device) + deform_inputs2 = [reference_points, spatial_shapes, level_start_index] + + return deform_inputs1, deform_inputs2 + + +class ConvFFN(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, + act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DWConv(nn.Module): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + n = N // 21 + x1 = x[:, 0:16 * n, :].transpose(1, 2).view(B, C, H * 2, W * 2).contiguous() + x2 = x[:, 16 * n:20 * n, :].transpose(1, 2).view(B, C, H, W).contiguous() + x3 = x[:, 20 * n:, :].transpose(1, 2).view(B, C, H // 2, W // 2).contiguous() + x1 = self.dwconv(x1).flatten(2).transpose(1, 2) + x2 = self.dwconv(x2).flatten(2).transpose(1, 2) + x3 = self.dwconv(x3).flatten(2).transpose(1, 2) + x = torch.cat([x1, x2, x3], dim=1) + return x + + +class Extractor(nn.Module): + def __init__(self, dim, num_heads=6, n_points=4, n_levels=1, deform_ratio=1.0, + with_cffn=True, cffn_ratio=0.25, drop=0., drop_path=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), with_cp=False): + super().__init__() + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + self.attn = MSDeformAttn(embed_dims=dim, num_levels=n_levels, num_heads=num_heads, + num_points=n_points) + self.with_cffn = with_cffn + self.with_cp = with_cp + if with_cffn: + self.ffn = ConvFFN(in_features=dim, hidden_features=int(dim * cffn_ratio), drop=drop) + self.ffn_norm = norm_layer(dim) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, query, reference_points, feat, spatial_shapes, level_start_index, H, W): + + def _inner_forward(query, feat): + + # attn = self.attn(self.query_norm(query), reference_points, + # self.feat_norm(feat), spatial_shapes, + # level_start_index, None) + # query = query + attn + + if self.with_cffn: + query = query + self.drop_path(self.ffn(self.ffn_norm(query), H, W)) + return query + + if self.with_cp and query.requires_grad: + query = cp.checkpoint(_inner_forward, query, feat) + else: + query = _inner_forward(query, feat) + + return query + + +class Injector(nn.Module): + def __init__(self, dim, num_heads=6, n_points=4, n_levels=1, deform_ratio=1.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), init_values=0., with_cp=False): + super().__init__() + self.with_cp = with_cp + self.query_norm = norm_layer(dim) + self.feat_norm = norm_layer(dim) + self.attn = MSDeformAttn(embed_dims=dim, num_levels=n_levels, num_heads=num_heads, + num_points=n_points) + self.gamma = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) + + def forward(self, query, reference_points, feat, spatial_shapes, level_start_index): + + def _inner_forward(query, feat): + + # attn = self.attn(self.query_norm(query), reference_points, + # self.feat_norm(feat), spatial_shapes, + # level_start_index, None) + return query + # return query + self.gamma * attn + + if self.with_cp and query.requires_grad: + query = cp.checkpoint(_inner_forward, query, feat) + else: + query = _inner_forward(query, feat) + + return query + + +class InteractionBlock(nn.Module): + def __init__(self, dim, num_heads=6, n_points=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop=0., drop_path=0., with_cffn=True, cffn_ratio=0.25, init_values=0., + deform_ratio=1.0, extra_extractor=False, with_cp=False): + super().__init__() + + self.injector = Injector(dim=dim, n_levels=3, num_heads=num_heads, init_values=init_values, + n_points=n_points, norm_layer=norm_layer, deform_ratio=deform_ratio, + with_cp=with_cp) + self.extractor = Extractor(dim=dim, n_levels=1, num_heads=num_heads, n_points=n_points, + norm_layer=norm_layer, deform_ratio=deform_ratio, with_cffn=with_cffn, + cffn_ratio=cffn_ratio, drop=drop, drop_path=drop_path, with_cp=with_cp) + if extra_extractor: + self.extra_extractors = nn.Sequential(*[ + Extractor(dim=dim, num_heads=num_heads, n_points=n_points, norm_layer=norm_layer, + with_cffn=with_cffn, cffn_ratio=cffn_ratio, deform_ratio=deform_ratio, + drop=drop, drop_path=drop_path, with_cp=with_cp) + for _ in range(2) + ]) + else: + self.extra_extractors = None + + def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W): + x = self.injector(query=x, reference_points=deform_inputs1[0], + feat=c, spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2]) + for idx, blk in enumerate(blocks): + x = blk(x) + c = self.extractor(query=c, reference_points=deform_inputs2[0], + feat=x, spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], H=H, W=W) + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor(query=c, reference_points=deform_inputs2[0], + feat=x, spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], H=H, W=W) + return x, c + + +class InteractionBlockWithCls(nn.Module): + def __init__(self, dim, num_heads=6, n_points=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), + drop=0., drop_path=0., with_cffn=True, cffn_ratio=0.25, init_values=0., + deform_ratio=1.0, extra_extractor=False, 
with_cp=False): + super().__init__() + + self.injector = Injector(dim=dim, n_levels=3, num_heads=num_heads, init_values=init_values, + n_points=n_points, norm_layer=norm_layer, deform_ratio=deform_ratio, + with_cp=with_cp) + self.extractor = Extractor(dim=dim, n_levels=1, num_heads=num_heads, n_points=n_points, + norm_layer=norm_layer, deform_ratio=deform_ratio, with_cffn=with_cffn, + cffn_ratio=cffn_ratio, drop=drop, drop_path=drop_path, with_cp=with_cp) + if extra_extractor: + self.extra_extractors = nn.Sequential(*[ + Extractor(dim=dim, num_heads=num_heads, n_points=n_points, norm_layer=norm_layer, + with_cffn=with_cffn, cffn_ratio=cffn_ratio, deform_ratio=deform_ratio, + drop=drop, drop_path=drop_path, with_cp=with_cp) + for _ in range(2) + ]) + else: + self.extra_extractors = None + + def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W): + x = self.injector(query=x, reference_points=deform_inputs1[0], + feat=c, spatial_shapes=deform_inputs1[1], + level_start_index=deform_inputs1[2]) + x = torch.cat((cls, x), dim=1) + for idx, blk in enumerate(blocks): + x = blk(x, H, W) + cls, x = x[:, :1, ], x[:, 1:, ] + c = self.extractor(query=c, reference_points=deform_inputs2[0], + feat=x, spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], H=H, W=W) + if self.extra_extractors is not None: + for extractor in self.extra_extractors: + c = extractor(query=c, reference_points=deform_inputs2[0], + feat=x, spatial_shapes=deform_inputs2[1], + level_start_index=deform_inputs2[2], H=H, W=W) + return x, c, cls + + +class SpatialPriorModule(nn.Module): + def __init__(self, inplanes=64, embed_dim=384, with_cp=False): + super().__init__() + self.with_cp = with_cp + + self.stem = nn.Sequential(*[ + nn.Conv2d(3, inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.Conv2d(inplanes, inplanes, kernel_size=3, stride=1, padding=1, bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.Conv2d(inplanes, inplanes, kernel_size=3, stride=1, padding=1, bias=False), + nn.SyncBatchNorm(inplanes), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + ]) + self.conv2 = nn.Sequential(*[ + nn.Conv2d(inplanes, 2 * inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(2 * inplanes), + nn.ReLU(inplace=True) + ]) + self.conv3 = nn.Sequential(*[ + nn.Conv2d(2 * inplanes, 4 * inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(4 * inplanes), + nn.ReLU(inplace=True) + ]) + self.conv4 = nn.Sequential(*[ + nn.Conv2d(4 * inplanes, 4 * inplanes, kernel_size=3, stride=2, padding=1, bias=False), + nn.SyncBatchNorm(4 * inplanes), + nn.ReLU(inplace=True) + ]) + self.fc1 = nn.Conv2d(inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + self.fc2 = nn.Conv2d(2 * inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + self.fc3 = nn.Conv2d(4 * inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + self.fc4 = nn.Conv2d(4 * inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) + + def forward(self, x): + + def _inner_forward(x): + c1 = self.stem(x) + c2 = self.conv2(c1) + c3 = self.conv3(c2) + c4 = self.conv4(c3) + c1 = self.fc1(c1) + c2 = self.fc2(c2) + c3 = self.fc3(c3) + c4 = self.fc4(c4) + + bs, dim, _, _ = c1.shape + # c1 = c1.view(bs, dim, -1).transpose(1, 2) # 4s + c2 = c2.view(bs, dim, -1).transpose(1, 2) # 8s + c3 = c3.view(bs, dim, -1).transpose(1, 2) # 16s + c4 = c4.view(bs, dim, 
-1).transpose(1, 2) # 32s + + return c1, c2, c3, c4 + + if self.with_cp and x.requires_grad: + outs = cp.checkpoint(_inner_forward, x) + else: + outs = _inner_forward(x) + return outs \ No newline at end of file diff --git a/cloud_adapter/models/backbones/beit.py b/cloud_adapter/models/backbones/beit.py new file mode 100644 index 0000000..d1260d8 --- /dev/null +++ b/cloud_adapter/models/backbones/beit.py @@ -0,0 +1,1178 @@ +# -------------------------------------------------------- +# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# By Hangbo Bao +# Based on timm, mmseg, setr, xcit and swin code bases +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/fudan-zvg/SETR +# https://github.com/facebookresearch/xcit/ +# https://github.com/microsoft/Swin-Transformer +# --------------------------------------------------------' +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmseg.models.builder import BACKBONES + +from mmengine.logging import MMLogger +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + +# Copyright (c) Open-MMLab. All rights reserved. +import io +import math +import os +import os.path as osp +import pkgutil +import time +import warnings +from collections import OrderedDict +from importlib import import_module +from tempfile import TemporaryDirectory + +import mmcv +import numpy as np +import torch +import torchvision +from mmengine.fileio import FileClient +from mmengine.fileio import load as load_file +from mmengine.dist import get_dist_info +from mmengine.model import is_model_wrapper +from mmengine import mkdir_or_exist +from scipy import interpolate +from torch.nn import functional as F +from torch.optim import Optimizer +from torch.utils import model_zoo + +ENV_MMCV_HOME = "MMCV_HOME" +ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" +DEFAULT_CACHE_DIR = "~/.cache" + + +def _get_mmcv_home(): + mmcv_home = os.path.expanduser( + os.getenv( + ENV_MMCV_HOME, + os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "mmcv"), + ) + ) + + mkdir_or_exist(mmcv_home) + return mmcv_home + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. + + This method is modified from :meth:`torch.nn.Module.load_state_dict`. + Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. 
+ """ + unexpected_keys = [] + all_missing_keys = [] + err_msg = [] + + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + # use _load_from_state_dict to enable checkpoint version control + def load(module, prefix=""): + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_model_wrapper(module): + module = module.module + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, + prefix, + local_metadata, + True, + all_missing_keys, + unexpected_keys, + err_msg, + ) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + load(module) + load = None # break load->load reference cycle + + # ignore "num_batches_tracked" of BN layers + missing_keys = [key for key in all_missing_keys if "num_batches_tracked" not in key] + + if unexpected_keys: + err_msg.append( + "unexpected key in source " f'state_dict: {", ".join(unexpected_keys)}\n' + ) + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n' + ) + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert(0, "The model and loaded state dict do not match exactly\n") + err_msg = "\n".join(err_msg) + if strict: + raise RuntimeError(err_msg) + elif logger is not None: + logger.warning(err_msg) + else: + print(err_msg) + + +def load_url_dist(url, model_dir=None, map_location="cpu"): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get("LOCAL_RANK", rank)) + if rank == 0: + checkpoint = model_zoo.load_url( + url, model_dir=model_dir, map_location=map_location + ) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = model_zoo.load_url( + url, model_dir=model_dir, map_location=map_location + ) + return checkpoint + + +def load_pavimodel_dist(model_path, map_location=None): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + try: + from pavi import modelcloud + except ImportError: + raise ImportError("Please install pavi to load checkpoint from modelcloud.") + rank, world_size = get_dist_info() + rank = int(os.environ.get("LOCAL_RANK", rank)) + if rank == 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + return checkpoint + + +def load_fileclient_dist(filename, backend, map_location): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get("LOCAL_RANK", rank)) + allowed_backends = ["ceph"] + if backend not in allowed_backends: + raise ValueError(f"Load from Backend {backend} is not supported.") + if rank == 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, 
map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, map_location=map_location) + return checkpoint + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f"torchvision.models.{name}") + if hasattr(_zoo, "model_urls"): + _urls = getattr(_zoo, "model_urls") + model_urls.update(_urls) + return model_urls + + +def get_external_models(): + mmcv_home = _get_mmcv_home() + default_json_path = osp.join(mmcv.__path__[0], "model_zoo/open_mmlab.json") + default_urls = load_file(default_json_path) + assert isinstance(default_urls, dict) + external_json_path = osp.join(mmcv_home, "open_mmlab.json") + if osp.exists(external_json_path): + external_urls = load_file(external_json_path) + assert isinstance(external_urls, dict) + default_urls.update(external_urls) + + return default_urls + + +def get_mmcls_models(): + mmcls_json_path = osp.join(mmcv.__path__[0], "model_zoo/mmcls.json") + mmcls_urls = load_file(mmcls_json_path) + + return mmcls_urls + + +def get_deprecated_model_names(): + deprecate_json_path = osp.join(mmcv.__path__[0], "model_zoo/deprecated.json") + deprecate_urls = load_file(deprecate_json_path) + assert isinstance(deprecate_urls, dict) + + return deprecate_urls + + +def _process_mmcls_checkpoint(checkpoint): + state_dict = checkpoint["state_dict"] + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k.startswith("backbone."): + new_state_dict[k[9:]] = v + new_checkpoint = dict(state_dict=new_state_dict) + + return new_checkpoint + + +def _load_checkpoint(filename, map_location=None): + """Load checkpoint from somewhere (modelzoo, file, url). + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str | None): Same as :func:`torch.load`. Default: None. + Returns: + dict | OrderedDict: The loaded checkpoint. It can be either an + OrderedDict storing model weights or a dict containing other + information, which depends on the checkpoint. 
+ """ + if filename.startswith("modelzoo://"): + warnings.warn( + 'The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead' + ) + model_urls = get_torchvision_models() + model_name = filename[11:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith("torchvision://"): + model_urls = get_torchvision_models() + model_name = filename[14:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith("open-mmlab://"): + model_urls = get_external_models() + model_name = filename[13:] + deprecated_urls = get_deprecated_model_names() + if model_name in deprecated_urls: + warnings.warn( + f"open-mmlab://{model_name} is deprecated in favor " + f"of open-mmlab://{deprecated_urls[model_name]}" + ) + model_name = deprecated_urls[model_name] + model_url = model_urls[model_name] + # check if is url + if model_url.startswith(("http://", "https://")): + checkpoint = load_url_dist(model_url) + else: + filename = osp.join(_get_mmcv_home(), model_url) + if not osp.isfile(filename): + raise IOError(f"{filename} is not a checkpoint file") + checkpoint = torch.load(filename, map_location=map_location) + elif filename.startswith("mmcls://"): + model_urls = get_mmcls_models() + model_name = filename[8:] + checkpoint = load_url_dist(model_urls[model_name]) + checkpoint = _process_mmcls_checkpoint(checkpoint) + elif filename.startswith(("http://", "https://")): + checkpoint = load_url_dist(filename) + elif filename.startswith("pavi://"): + model_path = filename[7:] + checkpoint = load_pavimodel_dist(model_path, map_location=map_location) + elif filename.startswith("s3://"): + checkpoint = load_fileclient_dist( + filename, backend="ceph", map_location=map_location + ) + else: + if not osp.isfile(filename): + raise IOError(f"{filename} is not a checkpoint file") + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +def cosine_scheduler( + base_value, + final_value, + epochs, + niter_per_ep, + warmup_epochs=0, + start_warmup_value=0, + warmup_steps=-1, +): + warmup_schedule = np.array([]) + warmup_iters = warmup_epochs * niter_per_ep + if warmup_steps > 0: + warmup_iters = warmup_steps + print("Set warmup steps = %d" % warmup_iters) + if warmup_epochs > 0: + warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) + + iters = np.arange(epochs * niter_per_ep - warmup_iters) + schedule = np.array( + [ + final_value + + 0.5 + * (base_value - final_value) + * (1 + math.cos(math.pi * i / (len(iters)))) + for i in iters + ] + ) + + schedule = np.concatenate((warmup_schedule, schedule)) + + assert len(schedule) == epochs * niter_per_ep + return schedule + + +def load_checkpoint(model, filename, map_location="cpu", strict=False, logger=None): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + Returns: + dict or OrderedDict: The loaded checkpoint. 
+ """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError(f"No state_dict found in checkpoint file {filename}") + # get state_dict from checkpoint + if "state_dict" in checkpoint: + state_dict = checkpoint["state_dict"] + elif "model" in checkpoint: + state_dict = checkpoint["model"] + elif "module" in checkpoint: + state_dict = checkpoint["module"] + else: + state_dict = checkpoint + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith("module."): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # for MoBY, load model of online branch + if sorted(list(state_dict.keys()))[0].startswith("encoder"): + state_dict = { + k.replace("encoder.", ""): v + for k, v in state_dict.items() + if k.startswith("encoder.") + } + + # reshape absolute position embedding for Swin + if state_dict.get("absolute_pos_embed") is not None: + absolute_pos_embed = state_dict["absolute_pos_embed"] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = model.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H * W: + logger.warning("Error in loading absolute_pos_embed, pass") + else: + state_dict["absolute_pos_embed"] = absolute_pos_embed.view( + N2, H, W, C2 + ).permute(0, 3, 1, 2) + + rank, _ = get_dist_info() + if "rel_pos_bias.relative_position_bias_table" in state_dict: + if rank == 0: + print("Expand the shared relative position embedding to each layers. ") + num_layers = model.get_num_layers() + rel_pos_bias = state_dict["rel_pos_bias.relative_position_bias_table"] + for i in range(num_layers): + state_dict[ + "blocks.%d.attn.relative_position_bias_table" % i + ] = rel_pos_bias.clone() + + state_dict.pop("rel_pos_bias.relative_position_bias_table") + + all_keys = list(state_dict.keys()) + for key in all_keys: + if "relative_position_index" in key: + state_dict.pop(key) + + if "relative_position_bias_table" in key: + rel_pos_bias = state_dict[key] + src_num_pos, num_attn_heads = rel_pos_bias.size() + dst_num_pos, _ = model.state_dict()[key].size() + dst_patch_shape = model.patch_embed.patch_shape + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1 + ) + src_size = int((src_num_pos - num_extra_tokens) ** 0.5) + dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5) + if src_size != dst_size: + if rank == 0: + print( + "Position interpolate for %s from %dx%d to %dx%d" + % (key, src_size, src_size, dst_size, dst_size) + ) + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + # if q > 1.13492: + # q = 1.13492 + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q ** (i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + if rank == 0: + print("x = {}".format(x)) + print("dx = {}".format(dx)) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind="cubic") + 
all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)) + .contiguous() + .view(-1, 1) + .to(rel_pos_bias.device) + ) + + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0) + state_dict[key] = new_rel_pos_bias + + if "pos_embed" in state_dict: + pos_embed_checkpoint = state_dict["pos_embed"] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches**0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + if rank == 0: + print( + "Position interpolate from %dx%d to %dx%d" + % (orig_size, orig_size, new_size, new_size) + ) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape( + -1, orig_size, orig_size, embedding_size + ).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, + size=(new_size, new_size), + mode="bicubic", + align_corners=False, + ) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + state_dict["pos_embed"] = new_pos_embed + + # interpolate position bias table if needed + relative_position_bias_table_keys = [ + k for k in state_dict.keys() if "relative_position_bias_table" in k + ] + for table_key in relative_position_bias_table_keys: + table_pretrained = state_dict[table_key] + table_current = model.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + logger.warning(f"Error in loading {table_key}, pass") + else: + if L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = F.interpolate( + table_pretrained.permute(1, 0).view(1, nH1, S1, S1), + size=(S2, S2), + mode="bicubic", + ) + state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute( + 1, 0 + ) + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + + Args: + state_dict (OrderedDict): Model weights on GPU. + Returns: + OrderedDict: Model weights on GPU. + """ + state_dict_cpu = OrderedDict() + for key, val in state_dict.items(): + state_dict_cpu[key] = val.cpu() + return state_dict_cpu + + +def _save_to_state_dict(module, destination, prefix, keep_vars): + """Saves module state to `destination` dictionary. + + This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. + Args: + module (nn.Module): The module to generate state_dict. + destination (dict): A dict where state will be stored. + prefix (str): The prefix for parameters and buffers used in this + module. + """ + for name, param in module._parameters.items(): + if param is not None: + destination[prefix + name] = param if keep_vars else param.detach() + for name, buf in module._buffers.items(): + # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d + if buf is not None: + destination[prefix + name] = buf if keep_vars else buf.detach() + + +def get_state_dict(module, destination=None, prefix="", keep_vars=False): + """Returns a dictionary containing a whole state of the module. 
+ + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. + This method is modified from :meth:`torch.nn.Module.state_dict` to + recursively check parallel module in case that the model has a complicated + structure, e.g., nn.Module(nn.Module(DDP)). + Args: + module (nn.Module): The module to generate state_dict. + destination (OrderedDict): Returned dict for the state of the + module. + prefix (str): Prefix of the key. + keep_vars (bool): Whether to keep the variable property of the + parameters. Default: False. + Returns: + dict: A dictionary containing a whole state of the module. + """ + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_model_wrapper(module): + module = module.module + + # below is the same as torch.nn.Module.state_dict() + if destination is None: + destination = OrderedDict() + destination._metadata = OrderedDict() + destination._metadata[prefix[:-1]] = local_metadata = dict(version=module._version) + _save_to_state_dict(module, destination, prefix, keep_vars) + for name, child in module._modules.items(): + if child is not None: + get_state_dict(child, destination, prefix + name + ".", keep_vars=keep_vars) + for hook in module._state_dict_hooks.values(): + hook_result = hook(module, destination, prefix, local_metadata) + if hook_result is not None: + destination = hook_result + return destination + + +def save_checkpoint(model, filename, optimizer=None, meta=None): + """Save checkpoint to file. + + The checkpoint will have 3 fields: ``meta``, ``state_dict`` and + ``optimizer``. By default ``meta`` will contain version and time info. + Args: + model (Module): Module whose params are to be saved. + filename (str): Checkpoint filename. + optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + meta (dict, optional): Metadata to be saved in checkpoint. 
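+
+    Example (a minimal sketch; the output path is hypothetical):
+        >>> save_checkpoint(model, "work_dirs/epoch_1.pth",
+        ...                 optimizer=optimizer, meta=dict(epoch=1))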
+ """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError(f"meta must be a dict or None, but got {type(meta)}") + meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) + + if is_model_wrapper(model): + model = model.module + + if hasattr(model, "CLASSES") and model.CLASSES is not None: + # save class name to the meta + meta.update(CLASSES=model.CLASSES) + + checkpoint = {"meta": meta, "state_dict": weights_to_cpu(get_state_dict(model))} + # save optimizer state dict in the checkpoint + if isinstance(optimizer, Optimizer): + checkpoint["optimizer"] = optimizer.state_dict() + elif isinstance(optimizer, dict): + checkpoint["optimizer"] = {} + for name, optim in optimizer.items(): + checkpoint["optimizer"][name] = optim.state_dict() + + if filename.startswith("pavi://"): + try: + from pavi import modelcloud + from pavi.exception import NodeNotFoundError + except ImportError: + raise ImportError("Please install pavi to load checkpoint from modelcloud.") + model_path = filename[7:] + root = modelcloud.Folder() + model_dir, model_name = osp.split(model_path) + try: + model = modelcloud.get(model_dir) + except NodeNotFoundError: + model = root.create_training_model(model_dir) + with TemporaryDirectory() as tmp_dir: + checkpoint_file = osp.join(tmp_dir, model_name) + with open(checkpoint_file, "wb") as f: + torch.save(checkpoint, f) + f.flush() + model.create_file(checkpoint_file, name=model_name) + else: + mmcv.mkdir_or_exist(osp.dirname(filename)) + # immediately flush buffer + with open(filename, "wb") as f: + torch.save(checkpoint, f) + f.flush() + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the original BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1 + ) + 3 + self.relative_position_bias_table = 
nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * window_size[1] + 1,) * 2, + dtype=relative_coords.dtype, + ) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.0) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + ( + self.q_bias, + torch.zeros_like(self.v_bias, requires_grad=False), + self.v_bias, + ) + ) + # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + #qkv: B,N,3,K,C->3,B,K,N,C + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + # attn : B,K,N,C@B,K,N,C->B,K,N,N + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + # relative_position_bias = relative_position_bias[:, 1:, 1:] + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + with_cp=False, + ): + super().__init__() + self.with_cp = with_cp + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + 
attn_head_dim=attn_head_dim, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values is not None: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, H, W, rel_pos_bias=None): + def _inner_forward(x): + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias) + ) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path( + self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias) + ) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + return x + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + # assert H == self.img_size[0] and W == self.img_size[1], \ + # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, Hp, Wp + + +class HybridEmbed(nn.Module): + """CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + """ + + def __init__( + self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768 + ): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature + # map for all networks, the feature metadata has reliable channel and stride info, but using + # stride to calc feature dim requires info about padding of each stage that isn't captured. 
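+                # Temporarily run the backbone in eval mode on a dummy input so that
+                # BatchNorm/Dropout statistics are not affected, then restore the
+                # original training mode after reading the output shape.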
+ training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[ + -1 + ] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +class RelativePositionBias(nn.Module): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1 + ) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype + ) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +# @BACKBONES.register_module() +class BEiT(nn.Module): + """Vision Transformer with support for patch or hybrid CNN input stage""" + + def __init__( + self, + img_size=512, + patch_size=16, + in_chans=3, + num_classes=80, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + hybrid_backbone=None, + norm_layer=None, + init_values=None, + use_checkpoint=False, + use_abs_pos_emb=False, + use_rel_pos_bias=True, + use_shared_rel_pos_bias=False, + pretrained=None, + with_cp=False, + ): + super().__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.norm_layer = norm_layer + self.num_classes = num_classes + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models + self.drop_path_rate = drop_path_rate + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, + img_size=img_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + else: 
+ self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + if use_abs_pos_emb: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads + ) + else: + self.rel_pos_bias = None + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.use_checkpoint = use_checkpoint + self.blocks = nn.ModuleList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + with_cp=with_cp, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias + else None, + ) + for i in range(depth) + ] + ) + + # if self.pos_embed is not None: + # trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_weights) + self.init_weights(pretrained) + + # self.fix_init_weight() + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + # pretrained = 'pretrained/beit_large_patch16_512_pt22k_ft22kto1k.pth' + if isinstance(pretrained, str): + logger = MMLogger.get_current_instance() + load_checkpoint(self, pretrained, strict=False, logger=logger) + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_num_layers(self): + return len(self.blocks) diff --git a/cloud_adapter/models/backbones/clip.py b/cloud_adapter/models/backbones/clip.py new file mode 100644 index 0000000..7c83c1d --- /dev/null +++ b/cloud_adapter/models/backbones/clip.py @@ -0,0 +1,365 @@ +from collections import OrderedDict +import torch +import torch.nn.functional as F +from torch import nn +from timm.models.layers import drop_path, trunc_normal_ +from mmseg.models.builder import BACKBONES + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + 
return "p={}".format(self.drop_prob) + + +class ResidualAttentionBlock(nn.Module): + def __init__( + self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop_path=0.0 + ): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict( + [ + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model)), + ] + ) + ) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def attention(self, x: torch.Tensor): + self.attn_mask = ( + self.attn_mask.to(dtype=x.dtype, device=x.device) + if self.attn_mask is not None + else None + ) + return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.drop_path(self.attention(self.ln_1(x))) + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Transformer(nn.Module): + def __init__( + self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + drop_path_rate=0.0, + ): + super().__init__() + self.width = width + self.layers = layers + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, layers) + ] # stochastic depth decay rule + self.resblocks = nn.Sequential( + *[ + ResidualAttentionBlock(width, heads, attn_mask, dpr[i]) + for i in range(layers) + ] + ) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.q_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.k_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.v_proj = nn.Linear(dim, dim, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, q, k, v): + B, N, C = q.shape + assert k.shape == v.shape + B, M, C = k.shape + q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads) + k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads) + v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads) + + attn = torch.einsum("bnkc,bmkc->bknm", q, k) * self.scale + + attn = attn.softmax(dim=-1) + + x = torch.einsum("bknm,bmkc->bnkc", attn, v).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class TransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dropout=0.1, + ): + super().__init__() + self.self_attn = Attention(d_model, nhead, proj_drop=dropout) + self.cross_attn = Attention(d_model, nhead, proj_drop=dropout) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.mlp = nn.Sequential( + nn.Linear(d_model, d_model * 4), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(d_model * 4, d_model), + ) + + def forward(self, x, mem): + q = k = v = self.norm1(x) + x = x + self.self_attn(q, k, v) + q = self.norm2(x) + x = x + self.cross_attn(q, mem, mem) + x = x + self.dropout(self.mlp(self.norm3(x))) + return x + + +@BACKBONES.register_module() +class 
CLIPVisionTransformer(nn.Module): + def __init__( + self, + input_resolution=224, + patch_size=32, + width=768, + layers=12, + heads=12, + output_dim=512, + drop_path_rate=0.0, + out_indices=[3, 5, 7, 11], + pretrained=None, + get_embeddings=False, + **kwargs, + ): + super().__init__() + self.pretrained = pretrained + self.input_resolution = input_resolution + self.output_dim = output_dim + self.patch_size = patch_size + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False, + ) + + scale = width**-0.5 + self.width = width + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter( + scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width) + ) + self.spatial_size = input_resolution // patch_size + self.ln_pre = LayerNorm(width) + self.get_embeddings = get_embeddings + + self.transformer = Transformer( + width, layers, heads, drop_path_rate=drop_path_rate + ) + + self.out_indices = out_indices + + if get_embeddings: + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + embed_dim = width + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + nn.SyncBatchNorm(embed_dim), + nn.GELU(), + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn3 = nn.GroupNorm(1, embed_dim) + + self.fpn4 = nn.Sequential( + nn.GroupNorm(1, embed_dim), nn.MaxPool2d(kernel_size=2, stride=2) + ) + + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.GroupNorm(1, embed_dim) + + self.fpn3 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + + self.fpn4 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=4, stride=4), + ) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = ( + torch.jit.load(pretrained, map_location="cpu").float().state_dict() + ) + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith("visual."): + new_k = k.replace("visual.", "") + state_dict[new_k] = checkpoint[k] + + if "positional_embedding" in state_dict.keys(): + if ( + self.positional_embedding.shape + != state_dict["positional_embedding"].shape + ): + print( + f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to {self.positional_embedding.shape}' + ) + cls_pos = state_dict["positional_embedding"][0:1, :] + leng = int(state_dict["positional_embedding"][1:,].shape[-2] ** 0.5) + spatial_pos = F.interpolate( + state_dict["positional_embedding"][1:,] + .reshape(1, leng, leng, self.width) + .permute(0, 3, 1, 2), + size=(self.spatial_size, self.spatial_size), + mode="bilinear", + ) + spatial_pos = spatial_pos.reshape( + self.width, self.spatial_size * self.spatial_size + ).permute(1, 0) + positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) + state_dict["positional_embedding"] = positional_embedding + assert ( + self.positional_embedding.shape + == state_dict["positional_embedding"].shape + ) + conv1 = state_dict["conv1.weight"] + C_o, C_in, H, W = conv1.shape + conv1 = 
torch.nn.functional.interpolate( + conv1.float(), + size=(self.patch_size, self.patch_size), + mode="bicubic", + align_corners=False, + ) + state_dict["conv1.weight"] = conv1 + + u, w = self.load_state_dict(state_dict, False) + print(u, w, "are misaligned params in vision transformer") + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + B, C, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat( + [ + self.class_embedding.to(x.dtype) + + torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device + ), + x, + ], + dim=1, + ) # shape = [*, grid ** 2 + 1, width] + + pos = self.positional_embedding.to(x.dtype) + cls_pos = pos[0, :] + self.class_embedding.to(x.dtype) + spatial_pos = F.interpolate( + pos[1:,] + .reshape(1, self.spatial_size, self.spatial_size, C) + .permute(0, 3, 1, 2), + size=(H, W), + mode="bilinear", + ) + spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1) + pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1) + x = x + pos + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # NLD -> LND + + features = [] + for i, blk in enumerate(self.transformer.resblocks): + x = blk(x) + if i in self.out_indices: + xp = x.permute(1, 0, 2)[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W) + features.append(xp.contiguous()) + + if self.get_embeddings: + x = x.permute(1, 0, 2) + x = self.ln_post(x) + x = x @ self.proj + + global_embedding = x[:, 0] + visual_embedding = ( + x[:, 1:].reshape(B, H, W, -1).permute(0, 3, 1, 2) + ) # B C H W + + features.append([global_embedding, visual_embedding]) + + return tuple(features) diff --git a/cloud_adapter/models/backbones/cloud_adapter.py b/cloud_adapter/models/backbones/cloud_adapter.py new file mode 100644 index 0000000..913ef2e --- /dev/null +++ b/cloud_adapter/models/backbones/cloud_adapter.py @@ -0,0 +1,591 @@ +import torch +from torch import nn +from einops import rearrange +from torch import nn, einsum +from einops import rearrange +from mmseg.models.builder import MODELS +import math +import torch +from torch import nn as nn +from mmseg.models.builder import MODELS +from timm.layers import DropPath, trunc_normal_ +from typing import List +from timm.layers import create_act_layer +from functools import partial +import torch.nn.functional as F + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from timm.layers import CondConv2d, get_condconv_initializer, create_conv2d, DropPath, get_norm_act_layer + + +class LoRaMLP(nn.Module): + def __init__(self, in_dim, out_dim, rank_dim=8): + super().__init__() + self.loramlp = nn.Sequential( + nn.Linear(in_dim, rank_dim, bias=False), + nn.Linear(rank_dim, out_dim, bias=False), + ) + + def forward(self, x): + return self.loramlp(x) + + +class CrossAttention(nn.Module): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, rank_dim=None): + super().__init__() + inner_dim = dim_head * heads # 512 + context_dim = query_dim if context_dim is None else context_dim + + self.scale = dim_head ** -0.5 + self.heads = heads + + if not rank_dim: + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Linear(inner_dim, query_dim, bias=False) + else: + self.to_q = LoRaMLP(query_dim, inner_dim, rank_dim=rank_dim) + self.to_k = LoRaMLP(context_dim, 
inner_dim, rank_dim=rank_dim) + self.to_v = LoRaMLP(context_dim, inner_dim, rank_dim=rank_dim) + + self.to_out = LoRaMLP(inner_dim, query_dim, rank_dim=rank_dim) + + def forward(self, x, context): + h = self.heads + + q = self.to_q(x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange( + t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) + + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + attn = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', attn, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + + return self.to_out(out) + + +def num_groups(group_size, channels): + if not group_size: + return 1 + else: + assert channels % group_size == 0 + return channels // group_size + + +def _init_weight_goog(m, n='', fix_group_fanout=True): + if isinstance(m, CondConv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + init_weight_fn = get_condconv_initializer( + lambda w: nn.init.normal_(w, 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) + init_weight_fn(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + nn.init.normal_(m.weight, 0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + fan_out = m.weight.size(0) + fan_in = 0 + if 'routing_fn' in n: + fan_in = m.weight.size(1) + init_range = 1.0 / math.sqrt(fan_in + fan_out) + nn.init.uniform_(m.weight, -init_range, init_range) + if m.bias is not None: + nn.init.zeros_(m.bias) + + +class DepthwiseSeparableConv(nn.Module): + def __init__( + self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='', + noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + se_layer=None, drop_path_rate=0.): + super(DepthwiseSeparableConv, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer) + groups = num_groups(group_size, in_chs) + self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip + self.has_pw_act = pw_act + + self.conv_dw = create_conv2d( + in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, groups=groups) + self.bn1 = norm_act_layer(in_chs, inplace=True) + + self.se = se_layer( + in_chs, act_layer=act_layer) if se_layer else nn.Identity() + + self.conv_pw = create_conv2d( + in_chs, out_chs, pw_kernel_size, padding=pad_type) + self.bn2 = norm_act_layer( + out_chs, inplace=True, apply_act=self.has_pw_act) + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': + return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) + else: + return dict(module='', hook_type='', num_chs=self.conv_pw.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + x = self.bn1(x) + x = self.se(x) + x = self.conv_pw(x) + x = self.bn2(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + +class PMAAConvBlock(nn.Module): + def __init__(self, in_channels=3, hidden_channels=256, depth=4, norm=nn.BatchNorm2d, act=nn.ReLU, return_multi_feats=False, return_last_feature=True, has_stem=True, has_block=True): + super().__init__() + self.return_last_feature = 
return_last_feature + self.depth = depth + self.has_stem = has_stem + self.return_multi_feats = return_multi_feats + + self.proj_1x1 = DepthwiseSeparableConv( + in_channels, hidden_channels, dw_kernel_size=1, norm_layer=norm, act_layer=act) + + self.spp_dw = nn.ModuleList() + + if has_stem: + self.spp_dw.append( + DepthwiseSeparableConv(hidden_channels, hidden_channels, dw_kernel_size=3, + stride=1, group_size=hidden_channels, pad_type="same") + ) + else: + self.spp_dw.append(nn.Identity()) + + if has_block: + for _ in range(self.depth): + self.spp_dw.append( + DepthwiseSeparableConv( + hidden_channels, hidden_channels, dw_kernel_size=3, stride=2, group_size=hidden_channels + ) + ) + else: + for _ in range(self.depth): + self.spp_dw.append( + nn.MaxPool2d(kernel_size=2, stride=2) + ) + self._init_weights() + + def forward(self, x): + B, C, H, W = x.shape + output1 = self.proj_1x1(x) + output = [self.spp_dw[0](output1)] + + for k in range(1, self.depth+1): + out_k = self.spp_dw[k](output[-1]) + output.append(out_k) + + if self.return_multi_feats: + return output[1:] + else: + if self.return_last_feature: + return output[-1] + global_f = torch.zeros( + output[-1].shape, requires_grad=True, device=output1.device) + for fea in output: + global_f = global_f + F.adaptive_avg_pool2d( + fea, output_size=output[-1].shape[-2:] + ) + return global_f + + def _init_weights(self): + init_fn = _init_weight_goog + for n, m in self.named_modules(): + init_fn(m, n) + + +class ConvnextInteractiveModule(nn.Module): + def __init__(self, emd_dim=1024, context_dim=256, rank_dim=None): + super().__init__() + self.attn = CrossAttention(emd_dim, context_dim, rank_dim=rank_dim) + + def forward(self, x, cache, index): + # x: 1024 2 1024 + if isinstance(cache, list) or isinstance(cache, tuple): + # len(cache) 4 cache[4]-23 + # 0-5->0 6-11 -> 1 12-17->2 18-23->3 + cache = cache[index] + cache = F.interpolate( + cache, (int(math.sqrt(x.shape[0])), int(math.sqrt(x.shape[0]))), mode="bilinear", align_corners=False + ) + cache = cache.flatten(2) # B C N + cache = cache.permute(2, 0, 1) # N B C + + # Reshape: batch first + x = x.permute(1, 0, 2) # B N C + cache = cache.permute(1, 0, 2) # B N C + return (x + self.attn(x, cache)).permute(1, 0, 2) + + +class PMAAInteractiveModule(nn.Module): + def __init__(self, + emd_dim=1024, + context_dim=64, + kernel: int = 1, + norm=nn.BatchNorm2d, + local_groups=32, + global_groups=2, + return_multi_feats=False, + ): + super().__init__() + self.return_multi_feats = return_multi_feats + self.local_embedding = nn.Sequential( + nn.Conv2d(emd_dim, emd_dim, kernel, groups=local_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.global_embedding = nn.Sequential( + nn.Conv2d(context_dim, emd_dim, kernel, groups=global_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.global_act = nn.Sequential( + nn.Conv2d(context_dim, emd_dim, kernel, groups=global_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.act = nn.Sigmoid() + self._init_weights() + + def _init_weights(self): + init_fn = _init_weight_goog + for n, m in self.named_modules(): + init_fn(m, n) + + def forward(self, x, cache, index): + if isinstance(cache, list) or isinstance(cache, tuple): + cache = cache[index] + N, B, C = x.shape + H = W = int(math.sqrt(N)) + # reshape x -> B, C, H, W + x = x.permute(1, 2, 0).reshape(B, C, H, W) + local_feat = self.local_embedding(x) # 32 + global_act = self.global_act(cache) + sig_act = 
F.interpolate(self.act(global_act), size=(H, W)) # 32 + + global_feat = self.global_embedding(cache) + global_feat = F.interpolate(global_feat, size=(H, W)) # 32 + + out = local_feat * sig_act + global_feat + + return out.permute(2, 3, 0, 1).reshape(N, B, C) + + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class Block(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, + padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + # pointwise/1x1 convs, implemented with linear layers + self.pwconv1 = nn.Linear(dim, 4 * dim) + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. + """ + + def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], + drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=[0, 1, 2, 3], + return_multi_feats=False, + return_last_feature=True + ): + super().__init__() + self.return_last_feature = return_last_feature + self.return_multi_feats = return_multi_feats + + # stem and 3 intermediate downsampling conv layers + self.downsample_layers = nn.ModuleList() + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=2, stride=2), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first") + ) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + # 4 feature resolution stages, each consisting of multiple residual blocks + self.stages = nn.ModuleList() + dp_rates = [x.item() + for x in torch.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential( + *[Block(dim=dims[i], drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] + ) + self.stages.append(stage) + cur += depths[i] + + self.out_indices = out_indices + + norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") + for i_layer in range(4): + layer = norm_layer(dims[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + if isinstance(pretrained, str): + self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') + + def forward_features(self, x): + outs = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x) + outs.append(x_out) + if self.return_multi_feats: + return tuple(outs) + if self.return_last_feature: + return outs[-1] + global_f = torch.zeros( + outs[-1].shape, requires_grad=True, device=outs[-1].device) + for fea in outs: + global_f = global_f + F.adaptive_avg_pool2d( + fea, output_size=outs[-1].shape[-2:] + ) + return global_f + + def forward(self, x): + x = self.forward_features(x) + return x + + +class NoAdaptingModule(nn.Identity): + def __init__(self): + super().__init__() + + def forward(self, x, cache, layer): + return x + + +@MODELS.register_module() +class CloudAdapter(nn.Module): + def __init__(self, + cnn_type="convnext", # convnext or mobilenet + int_type="convnext", # cross_attention or + # 共同的参数 start + emd_dim=1024, + num_layers=24, + + # 先判断是否返回多特征,之后再判断是否进行特征融合 + return_multi_feats=True, + return_last_feature=False, + + # 共同的参数 end + + # pmaa 提取单个特征 or 多尺寸特征 start + hidden_channels=256, + depth=4, + norm=nn.BatchNorm2d, + act=nn.ReLU, + # pmaa 提取单个特征 or 多尺寸特征 end + + # pmaa net start + local_groups=1, + global_groups=1, + # pmaa net end + + # convnext 提取单个特征 or 多尺寸特征 start + context_dim=256, + rank_dim=None, + # convnext 提取单个特征 or 多尺寸特征 end, + has_stem=True, + has_block=True, + ): + super().__init__() + self.cnn = nn.Identity() + self.net = nn.Identity() + if cnn_type == "pmaa": + self.cnn = PMAAConvBlock( + hidden_channels=hidden_channels, + depth=depth, + norm=norm, + act=act, + return_multi_feats=return_multi_feats, + return_last_feature=return_last_feature, + has_stem=has_stem, + has_block=has_block + ) + elif cnn_type == "convnext": + self.cnn = ConvNeXt(depths=[1]*4, + dims=[context_dim]*4, + return_multi_feats=return_multi_feats, + return_last_feature=return_last_feature + ) + + else: + raise ValueError( + f"cnn_type must in ['convnext','pmaa'],but got {cnn_type}") + + if int_type == "convnext": + self.net = nn.ModuleList( + ConvnextInteractiveModule(emd_dim, context_dim, rank_dim) + for _ in range(num_layers) + ) + elif int_type == "pmaa": + self.net = nn.ModuleList( + PMAAInteractiveModule( + emd_dim, context_dim, local_groups=local_groups, global_groups=global_groups) + for _ in range(num_layers) + ) + + elif int_type == "no_adapting": + self.net = nn.ModuleList( + NoAdaptingModule() for _ in range(num_layers) + ) + else: + raise ValueError( + f"int_type must in ['convnext','pmaa'],but got {int_type}") + + def forward(self, feats, layer, batch_first=True, has_cls_token=True, cache=None): + if batch_first: + feats = feats.permute(1, 0, 2) # 1025 2 1024 + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) + # 24 // 1 + # feat: 1024 2 1024 + feats = self.net[layer].forward( + feats, cache, layer//(len(self.net) // 4)) + + if has_cls_token: + feats = 
torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + + diff --git a/cloud_adapter/models/backbones/cloud_adapter_dinov2.py b/cloud_adapter/models/backbones/cloud_adapter_dinov2.py new file mode 100644 index 0000000..0c8fadb --- /dev/null +++ b/cloud_adapter/models/backbones/cloud_adapter_dinov2.py @@ -0,0 +1,115 @@ +from mmseg.models.builder import BACKBONES, MODELS +from torch import nn as nn +from .cloud_adapter import CloudAdapter +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train +import torch +import torch.nn.functional as F + + +@BACKBONES.register_module() +class CloudAdapterDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + cloud_adapter_config=None, + has_cat=False, + # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ], + adapter_index=[0, 6, 12, 18], # Transformer Block 的索引 + **kwargs, + ): + super().__init__(**kwargs) + self.cloud_adapter: CloudAdapter = MODELS.build(cloud_adapter_config) + self.has_cat = has_cat + self.adapter_index = adapter_index + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + cache = self.cloud_adapter.cnn(x) # 得到多尺度特征或者单个特征 + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + cur_idx = 0 # 交互模块的索引 + for idx, blk in enumerate(self.blocks): + x = blk(x) + if idx in self.adapter_index: + x = self.cloud_adapter.forward( + x, + cur_idx, + batch_first=True, + has_cls_token=True, + cache=cache, + ) + cur_idx += 1 + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape( + B, -1, H, W).contiguous() + ) + return outs, cache + + def process_cache(self,ret,cache): + cache = F.interpolate( + cache,size=(ret.shape[-2],ret.shape[-1]),mode="bilinear",align_corners=False) + return cache + + def forward(self, *args, **kwargs): + ret, cache = self.forward_features(*args, **kwargs) + if isinstance(ret[0], torch.Tensor): + ret[0] = F.interpolate( + ret[0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[1] = F.interpolate( + ret[1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[3] = F.interpolate( + ret[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + if isinstance(cache,tuple) or isinstance(cache,list): + ret[0] = torch.cat((ret[0], cache[0]), dim=1) + ret[1] = torch.cat((ret[1], cache[1]), dim=1) + ret[2] = torch.cat((ret[2], cache[2]), dim=1) + ret[3] = torch.cat((ret[3], cache[3]), dim=1) + else: + ret[0] = torch.cat((ret[0], self.process_cache(ret[0],cache)), dim=1) + ret[1] = torch.cat((ret[1], self.process_cache(ret[1],cache)), dim=1) + ret[2] = torch.cat((ret[2], self.process_cache(ret[2],cache)), dim=1) + ret[3] = torch.cat((ret[3], self.process_cache(ret[3],cache)), dim=1) + # ret[0] = torch.cat(ret[0], cache[0], dim=1) # bs 1024 128 128, bs 256 128 128 + else: + ret[0][0] = F.interpolate( + ret[0][0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[0][1] = F.interpolate( + ret[0][1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[0][3] = F.interpolate( + ret[0][3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + if isinstance(cache,tuple) or isinstance(cache,list): + ret[0][0] = torch.cat((ret[0][0], cache[0]), dim=1) + ret[0][1] = torch.cat((ret[0][1], cache[1]), dim=1) + ret[0][2] = torch.cat((ret[0][2], cache[2]), dim=1) + ret[0][3] = torch.cat((ret[0][3], cache[3]), dim=1) + else: + ret[0][0] = torch.cat((ret[0][0], 
self.process_cache(ret[0][0],cache)), dim=1) + ret[0][1] = torch.cat((ret[0][1], self.process_cache(ret[0][1],cache)), dim=1) + ret[0][2] = torch.cat((ret[0][2], self.process_cache(ret[0][2],cache)), dim=1) + ret[0][3] = torch.cat((ret[0][3], self.process_cache(ret[0][3],cache)), dim=1) + return ret + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["cloud_adapter"]) + set_train(self, ["cloud_adapter"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "cloud_adapter" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/cloud_adapter_sam.py b/cloud_adapter/models/backbones/cloud_adapter_sam.py new file mode 100644 index 0000000..29b0d18 --- /dev/null +++ b/cloud_adapter/models/backbones/cloud_adapter_sam.py @@ -0,0 +1,121 @@ +from mmseg.models.builder import BACKBONES, MODELS +from torch import nn as nn +from .cloud_adapter import CloudAdapter +from .sam_vit import SAMViT +from .utils import set_requires_grad, set_train +import torch +import torch.nn.functional as F + + +@BACKBONES.register_module() +class CloudAdapterSamVisionTransformer(SAMViT): + def __init__( + self, + cloud_adapter_config=None, + has_cat=False, + # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ], + adapter_index=[0, 6, 12, 18], # Transformer Block 的索引 + **kwargs, + ): + super().__init__(**kwargs) + self.cloud_adapter: CloudAdapter = MODELS.build(cloud_adapter_config) + self.has_cat = has_cat + self.adapter_index = adapter_index + self.embed_dim = kwargs['embed_dim'] + + def forward_features(self, x, masks=None): + cache = self.cloud_adapter.cnn(x) # 得到多尺度特征或者单个特征 + + B, C, H, W = x.shape + x = self.patch_embed(x) + Hp, Wp = H // self.patch_size, W // self.patch_size + if self.pos_embed is not None: + x = x + self.pos_embed + outs = [] + + cur_idx = 0 # 交互模块的索引 + for idx, blk in enumerate(self.blocks): + x = blk(x) + #print("x shape:",x.shape) # torch.Size([4, 32, 32, 768]) -> torch.Size([4, 1024, 768]) + if idx in self.adapter_index: + x = self.cloud_adapter.forward( + x.reshape(B,-1,self.embed_dim), + cur_idx, + batch_first=True, + has_cls_token=False, + cache=cache, + ) + x = x.reshape(B,Hp,Wp,self.embed_dim) + cur_idx += 1 + if idx in self.out_indices: + outs.append( + x.permute(0, 3, 1, 2) + ) + return outs, cache + + def process_cache(self,ret,cache): + cache = F.interpolate( + cache,size=(ret.shape[-2],ret.shape[-1]),mode="bilinear",align_corners=False) + return cache + + def forward(self, *args, **kwargs): + ret, cache = self.forward_features(*args, **kwargs) + if isinstance(ret[0], torch.Tensor): + ret[0] = F.interpolate( + ret[0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[1] = F.interpolate( + ret[1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[3] = F.interpolate( + ret[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + if isinstance(cache,tuple) or isinstance(cache,list): + ret[0] = torch.cat((ret[0], cache[0]), dim=1) + ret[1] = torch.cat((ret[1], cache[1]), dim=1) + ret[2] = torch.cat((ret[2], cache[2]), dim=1) + ret[3] = torch.cat((ret[3], cache[3]), dim=1) + else: + ret[0] = torch.cat((ret[0], self.process_cache(ret[0],cache)), dim=1) + ret[1] = torch.cat((ret[1], self.process_cache(ret[1],cache)), dim=1) + ret[2] = torch.cat((ret[2], self.process_cache(ret[2],cache)), dim=1) + 
ret[3] = torch.cat((ret[3], self.process_cache(ret[3],cache)), dim=1) + # ret[0] = torch.cat(ret[0], cache[0], dim=1) # bs 1024 128 128, bs 256 128 128 + else: + ret[0][0] = F.interpolate( + ret[0][0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[0][1] = F.interpolate( + ret[0][1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[0][3] = F.interpolate( + ret[0][3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + if isinstance(cache,tuple) or isinstance(cache,list): + ret[0][0] = torch.cat((ret[0][0], cache[0]), dim=1) + ret[0][1] = torch.cat((ret[0][1], cache[1]), dim=1) + ret[0][2] = torch.cat((ret[0][2], cache[2]), dim=1) + ret[0][3] = torch.cat((ret[0][3], cache[3]), dim=1) + else: + ret[0][0] = torch.cat((ret[0][0], self.process_cache(ret[0][0],cache)), dim=1) + ret[0][1] = torch.cat((ret[0][1], self.process_cache(ret[0][1],cache)), dim=1) + ret[0][2] = torch.cat((ret[0][2], self.process_cache(ret[0][2],cache)), dim=1) + ret[0][3] = torch.cat((ret[0][3], self.process_cache(ret[0][3],cache)), dim=1) + return ret + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["cloud_adapter"]) + set_train(self, ["cloud_adapter"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "cloud_adapter" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/cnnadapter.py b/cloud_adapter/models/backbones/cnnadapter.py new file mode 100644 index 0000000..904105f --- /dev/null +++ b/cloud_adapter/models/backbones/cnnadapter.py @@ -0,0 +1,340 @@ +import torch +from torch import nn +from einops import rearrange +from torch import nn, einsum +from einops import rearrange +from mmseg.models.builder import MODELS +import math +import torch +from torch import nn as nn +from mmseg.models.builder import MODELS +from timm.layers import DropPath,trunc_normal_ +from typing import List +from timm.layers import create_act_layer +from functools import partial +import torch.nn.functional as F + + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class Block(nn.Module): + r""" ConvNeXt Block. 
There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
+ """ + def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], + drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=[0, 1, 2, 3], + ): + super().__init__() + + self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first") + ) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential( + *[Block(dim=dims[i], drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] + ) + self.stages.append(stage) + cur += depths[i] + + self.out_indices = out_indices + + norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") + for i_layer in range(4): + layer = norm_layer(dims[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + if isinstance(pretrained, str): + self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') + + def forward_features(self, x): + outs = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x) + outs.append(x_out) + + return tuple(outs) + + def forward(self, x): + x = self.forward_features(x) + return x + + + +class InductionBias(nn.Module): + def __init__(self, in_chans=3, dim=16): + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(in_chans, dim, kernel_size=4, stride=4), + nn.BatchNorm2d(dim), + nn.ReLU(), + nn.Conv2d(dim, dim, kernel_size=4, stride=4), + nn.BatchNorm2d(dim), + nn.ReLU(), + ) + self.proj = nn.Sequential( + nn.Conv2d(dim, dim, 3, 1, 1), + nn.BatchNorm2d(dim), + nn.ReLU(), + nn.Conv2d(dim, dim, 3, 1, 1), + nn.BatchNorm2d(dim), + nn.ReLU(), + ) + + def forward(self, x): # input.shape=(bs, 3, 512, 512) output.shape=([bs, 1025, 16]) + x = self.stem(x) # x.shape=(bs, 16, 32, 32) + x = self.proj(x) # x.shape=(bs, 16, 32, 32) + x = x.flatten(2) # x.shape=(bs, 1024, 16) + return x.permute(2, 0, 1) + + +class MLP(nn.Module): + def __init__(self, in_dim, out_dim, bias): + super().__init__() + self.net = nn.Sequential( + nn.Linear(in_dim, out_dim, bias=bias), + ) + + def forward(self, x): + return self.net(x) + 
+ +class CrossAttention(nn.Module): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64): + super().__init__() + inner_dim = dim_head * heads + context_dim = query_dim if context_dim is None else context_dim + + self.scale = dim_head ** -0.5 + self.heads = heads + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Linear(inner_dim, query_dim, bias=False) + + + def forward(self, x, context): + h = self.heads + + q = self.to_q(x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) + + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + attn = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', attn, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + + return self.to_out(out) + + +class InteractiveModule(nn.Module): + def __init__(self, emd_dim=1024, context_dim=256): + super().__init__() + self.attn = CrossAttention(emd_dim, context_dim) + + def forward(self, x, cache, layer): + cache = cache[layer//6] + cache = F.interpolate( + cache, (int(math.sqrt(x.shape[0])), int(math.sqrt(x.shape[0]))), mode="bilinear", align_corners=False + ) + cache = cache.flatten(2) # B C N + cache = cache.permute(2, 0, 1) # N B C + + # Reshape: batch first + x = x.permute(1, 0, 2) # B N C + cache = cache.permute(1, 0, 2) # B N C + return (x + self.attn(x, cache)).permute(1, 0, 2) + +@MODELS.register_module() +class CNNAdapter(nn.Module): + def __init__(self, num_layers, emd_dim=1024, context_dim=256): + super().__init__() + self.cnn = ConvNeXt(depths=[1]*4, dims=[context_dim]*4) + self.net = nn.ModuleList( + InteractiveModule(emd_dim, context_dim) + for _ in range(num_layers) + ) + self.init_weight() + + def init_weight(self): + for m in self.net.modules(): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + + def forward(self, feats, layer, batch_first=True, has_cls_token=True, cache=None): + if batch_first: + feats = feats.permute(1, 0, 2) + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) + + feats = self.net[layer](feats, cache, layer) + + if has_cls_token: + feats = torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + +if __name__ == "__main__": + x = torch.randn((2, 1025, 1024)) + + convnext = ConvNeXt(in_chans=3,depths=[1]*4, dims=[64]*4) + + out = convnext(torch.randn((2,3,512,512))) + + model = CNNAdapter(4, 1024, 128) + cache = model.cnn(torch.randn((2, 3, 512, 512))) + + for feature in cache: + print(feature.shape) + + output = model(x, 0, cache=cache) + print(output.shape) + + # output, cache = model(x, 1, cache=cache) + # print(output.shape, cache.shape) + + # compute params + total_params = 0 + for param in model.parameters(): + total_params +=param.numel() + print(f"Total parameters in the model: {total_params/1e6:.2f}MB") + + + + cnn = model.cnn + inp = torch.randn(2, 3, 512, 512) + out = cnn(inp) + total_params = 0 + for param in cnn.parameters(): + total_params +=param.numel() + print(f"Total parameters in the convnext: {total_params/1e6:.2f}MB") # 0.23MB + + diff --git a/cloud_adapter/models/backbones/cnnadapter_dinov2.py b/cloud_adapter/models/backbones/cnnadapter_dinov2.py new file mode 100644 index 0000000..14441dc --- /dev/null +++ b/cloud_adapter/models/backbones/cnnadapter_dinov2.py @@ -0,0 +1,133 @@ +from 
mmseg.models.builder import BACKBONES, MODELS +from torch import nn as nn +from .cnnadapter import CNNAdapter +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train +import torch +import torch.nn.functional as F + +@BACKBONES.register_module() +class CNNAdapterDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + cnnadapter_config=None, + has_cat = False, + cross_attention_count=-1, + num_layers=24, + **kwargs, + ): + super().__init__(**kwargs) + self.cnnadapter: CNNAdapter = MODELS.build(cnnadapter_config) + self.has_cat = has_cat + self.cross_attention_count = cross_attention_count + self.num_layers = num_layers + + def is_cross_attention(self,idx:int): + if self.cross_attention_count == -1: + return True + + if idx % (self.num_layers // self.cross_attention_count) == 0: + return True + + return False + + + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + cache = self.cnnadapter.cnn(x) # obtain multi-scale features from the adapter CNN + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + cur_cross_idx = 0 + for idx, blk in enumerate(self.blocks): + x = blk(x) + if self.is_cross_attention(idx): + x = self.cnnadapter.forward( + x, + cur_cross_idx, + batch_first=True, + has_cls_token=True, + cache=cache, + ) + cur_cross_idx += 1 + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return outs, cache + + def forward(self, *args, **kwargs): + ret, cache = self.forward_features(*args, **kwargs) + if isinstance(ret[0], torch.Tensor): + ret[0] = F.interpolate( + ret[0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[1] = F.interpolate( + ret[1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[3] = F.interpolate( + ret[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + ret[0] = torch.cat((ret[0], cache[0]), dim=1) + ret[1] = torch.cat((ret[1], cache[1]), dim=1) + ret[2] = torch.cat((ret[2], cache[2]), dim=1) + ret[3] = torch.cat((ret[3], cache[3]), dim=1) + # ret[0] = torch.cat(ret[0], cache[0], dim=1) # bs 1024 128 128, bs 256 128 128 + else: + ret[0][0] = F.interpolate( + ret[0][0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[0][1] = F.interpolate( + ret[0][1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[0][3] = F.interpolate( + ret[0][3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + ret[0][0] = torch.cat((ret[0][0], cache[0]), dim=1) + ret[0][1] = torch.cat((ret[0][1], cache[1]), dim=1) + ret[0][2] = torch.cat((ret[0][2], cache[2]), dim=1) + ret[0][3] = torch.cat((ret[0][3], cache[3]), dim=1) + return ret + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["cnnadapter"]) + set_train(self, ["cnnadapter"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "cnnadapter" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state + +if __name__ == "__main__": + model = CNNAdapterDinoVisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + cnnadapter_config=dict( + type="CNNAdapter", + emd_dim=1024, + num_layers=24, + context_dim=256,
+ ), + ) + + # For obtaining the multi-scale features here there are two options: pmaa and convnext. + # Next, the x produced by each Transformer block can likewise be processed in two ways, one via pmaa and one via convnext, and the two can then be combined and mixed and matched. \ No newline at end of file diff --git a/cloud_adapter/models/backbones/convnext_adapter.py b/cloud_adapter/models/backbones/convnext_adapter.py new file mode 100644 index 0000000..495f5b0 --- /dev/null +++ b/cloud_adapter/models/backbones/convnext_adapter.py @@ -0,0 +1,129 @@ +import torch +from torch import nn as nn +from mmseg.models.builder import MODELS +from timm.layers import DropPath,trunc_normal_ +from typing import List +from timm.layers import create_act_layer + +class LayerNorm(nn.Module): + def __init__(self,dim): + super().__init__() + self.norm = nn.LayerNorm(dim) + def forward(self,x): + # x : [batch channel height width] + x = x.permute(0,2,3,1) + x = self.norm(x) + # x : [batch height width channel] + x = x.permute(0,3,1,2) + return x + +class Downsample(nn.Module): + def __init__(self,input_channels,out_channels): + super().__init__() + self.norm = LayerNorm(input_channels) + self.conv = nn.Conv2d(input_channels,out_channels,kernel_size=2,stride=2) + def forward(self,x:torch.Tensor): + x = self.norm(x) + x = self.conv(x) + return x + +@MODELS.register_module() +class AdapterConvNeXtBlock(nn.Module): + def __init__( + self, + embed_dim, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ): + super().__init__() + + self.has_conv = has_conv + self.has_proj = has_proj + + if self.has_conv: + self.conv = nn.Sequential( + LayerNorm(embed_dim), + nn.Conv2d(embed_dim, embed_dim, 7, 1, 3, groups=embed_dim), + ) + + if self.has_proj: + if rank_type == "low": + rank_dim = embed_dim // rank_scale + elif rank_type == "high": + rank_dim = embed_dim * rank_scale + else: + raise ValueError("rank_type must be low or high") + + self.proj = nn.Sequential( + LayerNorm(embed_dim), + nn.Conv2d(embed_dim, rank_dim, 1), + create_act_layer(act_layer), + nn.Conv2d(rank_dim, embed_dim, 1) + ) + + self.alpha = alpha + + self.drop_path = DropPath(drop_prob) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def forward(self,x:torch.Tensor,h:int=256 // 16,w:int=256 // 16): + B = x.shape[0] + cls,feature = torch.split(x,[1,x.shape[1] - 1],dim=1) + + feature = feature.permute(0, 2, 1).reshape(B, -1, h , w).contiguous() + + res = feature + + if self.has_conv: + feature = self.conv(feature) + + if self.has_proj: + feature = self.alpha * self.proj(feature) + + feature = self.drop_path(feature) + + feature = res + feature + + feature = feature.flatten(2).permute(0, 2, 1) # (B, C, H, W) -> (B, N, C); inverse of the permute/reshape above + + + + return torch.cat((cls,feature),dim=1) + +if __name__ == "__main__": + inp = torch.randn((2, 1025, 256)) + model = AdapterConvNeXtBlock( + embed_dim=1024, + rank_type="high", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ) + out = model(inp) + print(model) + assert out.shape == (2, 1025, 256) + + conv_params = 0 + proj_params = 0 + for name, param in model.named_parameters(): + if "conv" in name: + conv_params += param.numel() + if "proj" in name: + proj_params += param.numel() + print(f"conv_params: {conv_params/1e6:.2f}M") +
print(f"proj_params: {proj_params/1e6:.2f}M") + print(f"total_params: {(conv_params + proj_params)/1e6:.2f}M") \ No newline at end of file diff --git a/cloud_adapter/models/backbones/convnext_backbone.py b/cloud_adapter/models/backbones/convnext_backbone.py new file mode 100644 index 0000000..a07a562 --- /dev/null +++ b/cloud_adapter/models/backbones/convnext_backbone.py @@ -0,0 +1,197 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import trunc_normal_, DropPath + +# from mmseg.utils import get_root_logger +from mmseg.models.builder import BACKBONES + + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class Block(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + +@BACKBONES.register_module() +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. + """ + def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], + drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=[0, 1, 2, 3], + ): + super().__init__() + + self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first") + ) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential( + *[Block(dim=dims[i], drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] + ) + self.stages.append(stage) + cur += depths[i] + + self.out_indices = out_indices + + norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") + for i_layer in range(4): + layer = norm_layer(dims[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + if isinstance(pretrained, str): + self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') + + def forward_features(self, x): + outs = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x) + outs.append(x_out) + + return tuple(outs) + + def forward(self, x): + x = self.forward_features(x) + return x + + +if __name__=='__main__': + input = torch.rand(1, 3, 512, 512) + model = ConvNeXt(depths=[1, 1, 1, 1], dims=[256, 256, 256, 256]) + for i in model(input): + print(i.shape) + # torch.Size([2, 256, 128, 128]) + # torch.Size([2, 256, 64, 64]) + # torch.Size([2, 256, 32, 32]) + # torch.Size([2, 256, 16, 16]) + + from thop import profile + flops, params = profile(model, inputs=(input, )) + print(f"FLOPs: {flops/1e9:.2f}G, Params: {params/1e6:.2f}M") \ No newline at end of file diff --git a/cloud_adapter/models/backbones/convnext_dinov2.py b/cloud_adapter/models/backbones/convnext_dinov2.py new file mode 100644 index 0000000..47e4d9d --- /dev/null +++ b/cloud_adapter/models/backbones/convnext_dinov2.py @@ -0,0 +1,50 @@ +from mmseg.models.builder import BACKBONES, MODELS +from torch import nn as nn +from .convnext_adapter import AdapterConvNeXtBlock +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train + +@BACKBONES.register_module() +class ConvnextDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + convnext_config=None, + **kwargs, + ): + super().__init__(**kwargs) + # self.convnext: AdapterConvNeXtBlock = MODELS.build(convnext_config) + self.convnext_list = nn.ModuleList([MODELS.build(convnext_config) for _ in range(self.n_blocks-1)]) + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + if idx != len(self.blocks) - 1: + x = self.convnext_list[idx].forward( + x, + h=H, + w=W, + ) + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return outs + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["convnext"]) + set_train(self, ["convnext"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "convnext" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state \ No newline at end of file diff --git a/cloud_adapter/models/backbones/dino_layers/__init__.py b/cloud_adapter/models/backbones/dino_layers/__init__.py new file mode 100644 index 0000000..0498f46 --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock,drop_add_residual_stochastic_depth +from .attention import MemEffAttention \ No newline at end of file diff --git a/cloud_adapter/models/backbones/dino_layers/attention.py b/cloud_adapter/models/backbones/dino_layers/attention.py new file mode 100644 index 0000000..0fb76ef --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/attention.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging +import os +import warnings + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import memory_efficient_attention, unbind + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (Attention)") + else: + warnings.warn("xFormers is disabled (Attention)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + warnings.warn("xFormers is not available (Attention)") + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + if attn_bias is not None: + raise AssertionError("xFormers is required for using nested tensors") + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/cloud_adapter/models/backbones/dino_layers/block.py b/cloud_adapter/models/backbones/dino_layers/block.py new file mode 100644 index 0000000..930787b --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/block.py @@ -0,0 +1,260 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +import os +from typing import Callable, List, Any, Tuple, Dict +import warnings + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import fmha, scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (Block)") + else: + warnings.warn("xFormers is disabled (Block)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + + warnings.warn("xFormers is not available (Block)") + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to 
get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) 
+ + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + if not XFORMERS_AVAILABLE: + raise AssertionError("xFormers is required for using nested tensors") + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/cloud_adapter/models/backbones/dino_layers/dino_head.py b/cloud_adapter/models/backbones/dino_layers/dino_head.py new file mode 100644 index 0000000..0ace8ff --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/dino_head.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from torch.nn.utils import weight_norm + + +class DINOHead(nn.Module): + def __init__( + self, + in_dim, + out_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256, + mlp_bias=True, + ): + super().__init__() + nlayers = max(nlayers, 1) + self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) + self.apply(self._init_weights) + self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + eps = 1e-6 if x.dtype == torch.float16 else 1e-12 + x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) + x = self.last_layer(x) + return x + + +def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): + if nlayers == 1: + return nn.Linear(in_dim, bottleneck_dim, bias=bias) + else: + layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) + return nn.Sequential(*layers) diff --git a/cloud_adapter/models/backbones/dino_layers/drop_path.py b/cloud_adapter/models/backbones/dino_layers/drop_path.py new file mode 100644 index 0000000..1d640e0 --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/drop_path.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/cloud_adapter/models/backbones/dino_layers/layer_scale.py b/cloud_adapter/models/backbones/dino_layers/layer_scale.py new file mode 100644 index 0000000..51df0d7 --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/layer_scale.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/cloud_adapter/models/backbones/dino_layers/mlp.py b/cloud_adapter/models/backbones/dino_layers/mlp.py new file mode 100644 index 0000000..bbf9432 --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/mlp.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/cloud_adapter/models/backbones/dino_layers/patch_embed.py b/cloud_adapter/models/backbones/dino_layers/patch_embed.py new file mode 100644 index 0000000..8b7c080 --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/patch_embed.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/cloud_adapter/models/backbones/dino_layers/swiglu_ffn.py b/cloud_adapter/models/backbones/dino_layers/swiglu_ffn.py new file mode 100644 index 0000000..5e9dafa --- /dev/null +++ b/cloud_adapter/models/backbones/dino_layers/swiglu_ffn.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import os +from typing import Callable, Optional +import warnings + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (SwiGLU)") + else: + warnings.warn("xFormers is disabled (SwiGLU)") + raise ImportError +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + warnings.warn("xFormers is not available (SwiGLU)") + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/cloud_adapter/models/backbones/dino_v2.py b/cloud_adapter/models/backbones/dino_v2.py new file mode 100644 index 0000000..59ea195 --- /dev/null +++ b/cloud_adapter/models/backbones/dino_v2.py @@ -0,0 +1,353 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from mmseg.models.builder import BACKBONES +from mmengine.model import BaseModule +import torch.nn.functional as F +from .dino_layers import ( + Mlp, + PatchEmbed, + SwiGLUFFNFused, + MemEffAttention, + NestedTensorBlock as Block, +) + + +def named_apply( + fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False +) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply( + fn=fn, + module=child_module, + name=child_name, + depth_first=depth_first, + include_root=True, + ) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +@BACKBONES.register_module() +class DinoVisionTransformer(BaseModule): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=partial(Block, attn_class=MemEffAttention), + ffn_layer="mlp", + block_chunks=1, + out_indices=[7, 11, 15, 23], + init_cfg=None, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + """ + super().__init__(init_cfg) + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.out_indices = out_indices + self.drop_path_rate = drop_path_rate + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.norm_layer = norm_layer + self.patch_size = patch_size + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim) + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [ 
+ x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + if ffn_layer == "mlp": + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append( + [nn.Identity()] * i + blocks_list[i : i + chunksize] + ) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape( + 1, int(math.sqrt(N)), int(math.sqrt(N)), dim + ).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode="bicubic", + ) + + assert ( + int(w0) == patch_pos_embed.shape[-2] + and int(h0) == patch_pos_embed.shape[-1] + ) + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to( + previous_dtype + ) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where( + masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x + ) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [ + self.prepare_tokens_with_masks(x, masks) + for x, masks in zip(x_list, masks_list) + ] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_patchtokens": x_norm[:, 1:], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + if idx in self.out_indices: + outs.append( + x[:, 1:, :] + .permute(0, 2, 1) + .reshape(B, -1, h // self.patch_size, w // self.patch_size) + .contiguous() + ) + return outs + + def 
_get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = ( + range(total_block_len - n, total_block_len) if isinstance(n, int) else n + ) + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len( + blocks_to_take + ), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. If it's a list, take them + blocks_to_take = ( + range(total_block_len - n, total_block_len) if isinstance(n, int) else n + ) + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len( + blocks_to_take + ), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1) + .permute(0, 3, 1, 2) + .contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, **kwargs): + ret = self.forward_features(*args, **kwargs) + if isinstance(ret[0], torch.Tensor): + ret[0] = F.interpolate( + ret[0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[1] = F.interpolate( + ret[1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[3] = F.interpolate( + ret[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + else: + ret[0][0] = F.interpolate( + ret[0][0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[0][1] = F.interpolate( + ret[0][1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[0][3] = F.interpolate( + ret[0][3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + return ret \ No newline at end of file diff --git a/cloud_adapter/models/backbones/eva_02.py b/cloud_adapter/models/backbones/eva_02.py new file mode 100644 index 0000000..6a45513 --- /dev/null +++ b/cloud_adapter/models/backbones/eva_02.py @@ -0,0 +1,849 @@ +# -------------------------------------------------------- +# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# By Hangbo Bao +# Based on timm, mmseg, setr, xcit and swin code bases +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/fudan-zvg/SETR +# https://github.com/facebookresearch/xcit/ +# https://github.com/microsoft/Swin-Transformer +# 
--------------------------------------------------------' + +import torch +from functools import partial +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + +from .beit import load_checkpoint +from mmengine.logging import MMLogger +from mmseg.models.builder import BACKBONES +from mmcv.cnn import build_norm_layer +import xformers.ops as xops +# from apex.normalization import FusedLayerNorm +# from apex.normalization import FusedLayerNorm + + +from math import pi +from einops import rearrange, repeat + + +def broadcat(tensors, dim=-1): + num_tensors = len(tensors) + shape_lens = set(list(map(lambda t: len(t.shape), tensors))) + assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" + shape_len = list(shape_lens)[0] + dim = (dim + shape_len) if dim < 0 else dim + dims = list(zip(*map(lambda t: list(t.shape), tensors))) + expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] + assert all( + [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)] + ), "invalid dimensions for broadcastable concatentation" + max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) + expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) + expanded_dims.insert(dim, (dim, dims[dim])) + expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) + tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) + return torch.cat(tensors, dim=dim) + + +def rotate_half(x): + x = rearrange(x, "... (d r) -> ... d r", r=2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return rearrange(x, "... d r -> ... (d r)") + + +class VisionRotaryEmbedding(nn.Module): + def __init__( + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs=None, + freqs_for="lang", + theta=10000, + max_freq=10, + num_freqs=1, + ): + super().__init__() + if custom_freqs: + freqs = custom_freqs + elif freqs_for == "lang": + freqs = 1.0 / ( + theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim) + ) + elif freqs_for == "pixel": + freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi + elif freqs_for == "constant": + freqs = torch.ones(num_freqs).float() + else: + raise ValueError(f"unknown modality {freqs_for}") + + if ft_seq_len is None: + ft_seq_len = pt_seq_len + t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len + + freqs_h = torch.einsum("..., f -> ... f", t, freqs) + freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2) + + freqs_w = torch.einsum("..., f -> ... f", t, freqs) + freqs_w = repeat(freqs_w, "... n -> ... 
(n r)", r=2) + + freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1) + + self.register_buffer("freqs_cos", freqs.cos()) + self.register_buffer("freqs_sin", freqs.sin()) + + print("======== shape of rope freq", self.freqs_cos.shape, "========") + + def forward(self, t, start_index=0): + rot_dim = self.freqs_cos.shape[-1] + end_index = start_index + rot_dim + assert ( + rot_dim <= t.shape[-1] + ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}" + t_left, t, t_right = ( + t[..., :start_index], + t[..., start_index:end_index], + t[..., end_index:], + ) + t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin) + return torch.cat((t_left, t, t_right), dim=-1) + + +class VisionRotaryEmbeddingFast(nn.Module): + def __init__( + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs=None, + freqs_for="lang", + theta=10000, + max_freq=10, + num_freqs=1, + ): + super().__init__() + if custom_freqs: + freqs = custom_freqs + elif freqs_for == "lang": + freqs = 1.0 / ( + theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim) + ) + elif freqs_for == "pixel": + freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi + elif freqs_for == "constant": + freqs = torch.ones(num_freqs).float() + else: + raise ValueError(f"unknown modality {freqs_for}") + + if ft_seq_len is None: + ft_seq_len = pt_seq_len + t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len + + freqs = torch.einsum("..., f -> ... f", t, freqs) + freqs = repeat(freqs, "... n -> ... (n r)", r=2) + freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1) + + freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) + freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) + + self.register_buffer("freqs_cos", freqs_cos) + self.register_buffer("freqs_sin", freqs_sin) + + def forward(self, t): + return t * self.freqs_cos + rotate_half(t) * self.freqs_sin + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class SwiGLU(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.SiLU, + drop=0.0, + norm_layer=nn.LayerNorm, + subln=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.w1 = nn.Linear(in_features, hidden_features) + self.w2 = nn.Linear(in_features, hidden_features) + + self.act = act_layer() + if isinstance(norm_layer, dict): + self.ffn_ln = ( + build_norm_layer(norm_layer, hidden_features)[1] + if subln + else nn.Identity() + ) + else: + self.ffn_ln = norm_layer(hidden_features) if subln else 
nn.Identity() + self.w3 = nn.Linear(hidden_features, out_features) + + self.drop = nn.Dropout(drop) + + def forward(self, x): + x1 = self.w1(x) + x2 = self.w2(x) + hidden = self.act(x1) * x2 + x = self.ffn_ln(hidden) + x = self.w3(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + subln=False, + norm_layer=nn.LayerNorm, + xattn=False, + rope=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.subln = subln + if self.subln: + self.q_proj = nn.Linear(dim, all_head_dim, bias=False) + self.k_proj = nn.Linear(dim, all_head_dim, bias=False) + self.v_proj = nn.Linear(dim, all_head_dim, bias=False) + else: + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1 + ) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * window_size[1] + 1,) * 2, + dtype=relative_coords.dtype, + ) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.0) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.xattn = xattn + self.rope = rope + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + + if self.subln: + q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) + k = F.linear(input=x, weight=self.k_proj.weight, bias=None) + v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) + + q = q.reshape(B, N, self.num_heads, -1).permute( + 0, 2, 1, 3 + ) # B, num_heads, N, C + k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + v 
= v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + else: + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + ( + self.q_bias, + torch.zeros_like(self.v_bias, requires_grad=False), + self.v_bias, + ) + ) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute( + 2, 0, 3, 1, 4 + ) # 3, B, num_heads, N, C + q, k, v = qkv[0], qkv[1], qkv[2] + + if self.rope: + q_t = q[:, :, 1:, :] + ro_q_t = self.rope(q_t) + q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v) + + k_t = k[:, :, 1:, :] + ro_k_t = self.rope(k_t) + k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v) + + if self.xattn: + q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + x = xops.memory_efficient_attention(q, k, v) + x = x.reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + else: + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + subln=False, + xattn=False, + naiveswiglu=False, + rope=None, + ): + super().__init__() + if isinstance(norm_layer, dict): + self.norm1 = build_norm_layer(norm_layer, dim)[1] + else: + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + attn_head_dim=attn_head_dim, + subln=subln, + norm_layer=norm_layer, + xattn=xattn, + rope=rope, + ) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + if isinstance(norm_layer, dict): + self.norm2 = build_norm_layer(norm_layer, dim)[1] + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + + if naiveswiglu: + self.mlp = SwiGLU( + in_features=dim, + hidden_features=mlp_hidden_dim, + subln=subln, + norm_layer=norm_layer, + ) + else: + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values is not None: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path( + self.gamma_1 * 
self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias) + ) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + # assert H == self.img_size[0] and W == self.img_size[1], \ + # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, (Hp, Wp) + + +class HybridEmbed(nn.Module): + """CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + """ + + def __init__( + self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768 + ): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature + # map for all networks, the feature metadata has reliable channel and stride info, but using + # stride to calc feature dim requires info about padding of each stage that isn't captured. 
+ training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[ + -1 + ] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +class RelativePositionBias(nn.Module): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1 + ) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype + ) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +@BACKBONES.register_module() +class EVA2(nn.Module): + """Vision Transformer with support for patch or hybrid CNN input stage""" + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=80, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4 * 2 / 3, # GLU default + qkv_bias=False, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + hybrid_backbone=None, + norm_layer=None, + init_values=None, + use_checkpoint=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + out_indices=[3, 5, 7, 11], + subln=True, + xattn=True, + naiveswiglu=True, + rope=True, + pt_hw_seq_len=16, + intp_freq=True, + pretrained=None, + ): + super().__init__() + # norm_layer = norm_layer or partial(FusedLayerNorm, eps=1e-6) + self.num_classes = num_classes + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, + 
img_size=img_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + + num_patches = self.patch_embed.num_patches + self.out_indices = out_indices + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + + if use_abs_pos_emb: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + else: + self.pos_embed = None + + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads + ) + else: + self.rel_pos_bias = None + + if rope: + half_head_dim = embed_dim // num_heads // 2 + hw_seq_len = img_size // patch_size + self.rope = VisionRotaryEmbeddingFast( + dim=half_head_dim, + pt_seq_len=pt_hw_seq_len, + ft_seq_len=hw_seq_len if intp_freq else None, + ) + else: + self.rope = None + + self.naiveswiglu = naiveswiglu + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.use_checkpoint = use_checkpoint + self.blocks = nn.ModuleList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias + else None, + subln=subln, + xattn=xattn, + naiveswiglu=naiveswiglu, + rope=self.rope, + ) + for i in range(depth) + ] + ) + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=0.02) + trunc_normal_(self.cls_token, std=0.02) + + # if patch_size == 16: + # self.fpn1 = nn.Sequential( + # nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + # nn.SyncBatchNorm(embed_dim), + # nn.GELU(), + # nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + # ) + + # self.fpn2 = nn.Sequential( + # nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + # ) + + # self.fpn3 = nn.Identity() + + # self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) + # elif patch_size == 8: + # self.fpn1 = nn.Sequential( + # nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + # ) + + # self.fpn2 = nn.Identity() + + # self.fpn3 = nn.Sequential( + # nn.MaxPool2d(kernel_size=2, stride=2), + # ) + + # self.fpn4 = nn.Sequential( + # nn.MaxPool2d(kernel_size=4, stride=4), + # ) + # self.init_weights(pretrained) + self.pretrained = pretrained + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def init_weights(self): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + pretrained = self.pretrained + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + if isinstance(pretrained, str): + self.apply(_init_weights) + logger = MMLogger.get_current_instance() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError("pretrained must be a str or None") + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed", "cls_token"} + + def forward_features(self, x): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand( + batch_size, -1, -1 + ) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + features = [] + for i, blk in enumerate(self.blocks): + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, rel_pos_bias) + else: + x = blk(x, rel_pos_bias) + if i in self.out_indices: + xp = x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp) + features.append(xp.contiguous()) + features[0] = F.interpolate( + features[0], scale_factor=4, mode="bilinear", align_corners=False + ) + features[1] = F.interpolate( + features[1], scale_factor=2, mode="bilinear", align_corners=False + ) + features[3] = F.interpolate( + features[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + + return tuple(features) + + def forward(self, x): + x = self.forward_features(x) + return x diff --git a/cloud_adapter/models/backbones/loracacheadapter.py b/cloud_adapter/models/backbones/loracacheadapter.py new file mode 100644 index 0000000..7c90822 --- /dev/null +++ b/cloud_adapter/models/backbones/loracacheadapter.py @@ -0,0 +1,167 @@ +import torch +from torch import nn +from einops import rearrange +from torch import nn, einsum +from einops import rearrange +from mmseg.models.builder import MODELS +import math + + +class InductionBias(nn.Module): + def __init__(self, in_chans=3, dim=16): + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(in_chans, dim, kernel_size=4, stride=4), + nn.BatchNorm2d(dim), + nn.ReLU(), + nn.Conv2d(dim, dim, kernel_size=4, stride=4), + nn.BatchNorm2d(dim), + nn.ReLU(), + ) + self.proj = nn.Sequential( + nn.Conv2d(dim, dim, 3, 1, 1), + nn.BatchNorm2d(dim), + nn.ReLU(), + nn.Conv2d(dim, dim, 3, 1, 1), + nn.BatchNorm2d(dim), + nn.ReLU(), + ) + + def forward(self, x): # input.shape=(bs, 3, 512, 512) output.shape=([bs, 1025, 16]) + x = self.stem(x) # x.shape=(bs, 16, 32, 32) + x = self.proj(x) # x.shape=(bs, 16, 32, 32) + x = x.flatten(2) # x.shape=(bs, 1024, 16) + return x.permute(2, 0, 1) + + +class LoRAMLP(nn.Module): + def __init__(self, in_dim, rank_dim, out_dim, bias): + super().__init__() + self.net = nn.Sequential( + nn.Linear(in_dim, rank_dim, bias=bias), + nn.LayerNorm(rank_dim), # 添加LayerNorm + nn.Linear(rank_dim, out_dim, bias=bias), + nn.LayerNorm(out_dim) # 添加LayerNorm + ) + + def forward(self, x): + return self.net(x) + + +class LoRACrossAttention(nn.Module): + def __init__(self, query_dim, rank_dim=8, context_dim=None, heads=2, dim_head=8): + super().__init__() + inner_dim 
= dim_head * heads + context_dim = query_dim if context_dim is None else context_dim + + self.scale = dim_head ** -0.5 + self.heads = heads + + self.to_q = LoRAMLP(query_dim, rank_dim, inner_dim, bias=False) + self.to_k = LoRAMLP(context_dim, rank_dim, inner_dim, bias=False) + self.to_v = LoRAMLP(context_dim, rank_dim, inner_dim, bias=False) + + self.to_out = LoRAMLP(inner_dim, rank_dim, query_dim, bias=False) + + + def forward(self, x, context): + h = self.heads + + q = self.to_q(x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) + + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + attn = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', attn, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + + return self.to_out(out) + + +class LoRACacheModule(nn.Module): + def __init__(self, emd_dim=1024, rank_dim=8, cache_dim=16): + super().__init__() + self.main = LoRAMLP(emd_dim, rank_dim, emd_dim, bias=True) + self.last = LoRAMLP(emd_dim, rank_dim, cache_dim, bias=True) + self.fuse = LoRAMLP(2*cache_dim, rank_dim, cache_dim, bias=True) + self.attn = LoRACrossAttention(emd_dim, rank_dim, cache_dim) + + def forward(self, x, cache=None): + x_main = self.main(x) + x_last = self.last(x) + + if cache is not None: + cache = self.fuse(torch.cat([x_last, cache], dim=-1)) + + attn_output = self.attn(x_main.permute(1, 0, 2), context=cache.permute(1, 0, 2)) + + x_main = x_main + attn_output.permute(1, 0, 2) + + if cache is None: + cache = x_last + + return x_main, cache + +@MODELS.register_module() +class LoRACacheAdapter(nn.Module): + def __init__(self, num_layers, emd_dim=1024, rank_dim=16, cache_dim=256,has_cnn=True): + super().__init__() + self.rank_dim = rank_dim + self.cnn = nn.Identity() + if has_cnn: + self.cnn = InductionBias(3, cache_dim) + self.net = nn.ModuleList( + LoRACacheModule(emd_dim, rank_dim, cache_dim) + for i in range(num_layers) + ) + self.init_weight() + + def init_weight(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + + def forward(self, feats, layer, batch_first=True, has_cls_token=True, cache=None): + if batch_first: + feats = feats.permute(1, 0, 2) + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) + + feats, cache = self.net[layer](feats, cache) + + if has_cls_token: + feats = torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats, cache + +if __name__ == "__main__": + x = torch.randn((2, 1025, 1024)) + model = LoRACacheAdapter(24) + output, cache = model(x, 0) + print(output.shape, cache.shape) + + # output, cache = model(x, 1, cache=cache) + # print(output.shape, cache.shape) + + # compute params + total_params = 0 + for param in model.parameters(): + total_params +=param.numel() + print(f"Total parameters in the model: {total_params/1e6:.2f}MB") + + + cnn = InductionBias() + inp = torch.randn(2, 3, 512, 512) + out = cnn(inp) + total_params = 0 + for param in cnn.parameters(): + total_params +=param.numel() + print(f"Total parameters in the model: {total_params/1e6:.2f}MB") # 0.23MB + + diff --git a/cloud_adapter/models/backbones/loracacheadapter_dinov2.py b/cloud_adapter/models/backbones/loracacheadapter_dinov2.py new file mode 100644 index 0000000..13fc506 --- /dev/null +++ b/cloud_adapter/models/backbones/loracacheadapter_dinov2.py @@ -0,0 +1,77 @@ +from mmseg.models.builder import BACKBONES, MODELS +from torch 
import nn as nn +from .loracacheadapter import LoRACacheAdapter +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train + + +@BACKBONES.register_module() +class LoRACacheAdapterDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + loracacheadapter_config=None, + **kwargs, + ): + super().__init__(**kwargs) + self.loracacheadapter: LoRACacheAdapter = MODELS.build(loracacheadapter_config) + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + first_cache = self.loracacheadapter.cnn(x) + if isinstance(self.loracacheadapter.cnn,nn.Identity): + first_cache = None + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + x, cache = self.loracacheadapter.forward( + x, + idx, + batch_first=True, + has_cls_token=True, + cache=first_cache if idx == 0 else cache, + ) + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return outs + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["loracacheadapter"]) + set_train(self, ["loracacheadapter"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "loracacheadapter" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state + +if __name__ == "__main__": + model = LoRACacheAdapterDinoVisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + loracacheadapter_config=dict( + type="LoRACacheAdapter", + emd_dim=1024, + num_layers=24, + rank_dim=8, + cache_dim=16, + ), + ) diff --git a/cloud_adapter/models/backbones/my_rein_dinov2.py b/cloud_adapter/models/backbones/my_rein_dinov2.py new file mode 100644 index 0000000..5fd1cbc --- /dev/null +++ b/cloud_adapter/models/backbones/my_rein_dinov2.py @@ -0,0 +1,49 @@ +from mmseg.models.builder import BACKBONES, MODELS +from .myreins import MyReins +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train + + +@BACKBONES.register_module() +class MyReinsDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + reins_config=None, + **kwargs, + ): + super().__init__(**kwargs) + self.reins: MyReins = MODELS.build(reins_config) + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + x = self.reins.forward( + x, + idx, + batch_first=True, + has_cls_token=True, + ) + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return self.reins.return_auto(outs) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "reins" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git 
a/cloud_adapter/models/backbones/my_rein_token_mlp_dinov2.py b/cloud_adapter/models/backbones/my_rein_token_mlp_dinov2.py new file mode 100644 index 0000000..1d4747d --- /dev/null +++ b/cloud_adapter/models/backbones/my_rein_token_mlp_dinov2.py @@ -0,0 +1,49 @@ +from mmseg.models.builder import BACKBONES, MODELS +from .myrein_tonken_mlp import MyReinsTokenMlp +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train + + +@BACKBONES.register_module() +class MyReinTokenDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + reins_config=None, + **kwargs, + ): + super().__init__(**kwargs) + self.reins: MyReinsTokenMlp = MODELS.build(reins_config) + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + x = self.reins.forward( + x, + idx, + batch_first=True, + has_cls_token=True, + ) + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return self.reins.return_auto(outs) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "reins" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/myrein_tonken_mlp.py b/cloud_adapter/models/backbones/myrein_tonken_mlp.py new file mode 100644 index 0000000..56a7081 --- /dev/null +++ b/cloud_adapter/models/backbones/myrein_tonken_mlp.py @@ -0,0 +1,153 @@ +from mmseg.models.builder import MODELS +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from functools import reduce +from operator import mul +from torch import Tensor + + +@MODELS.register_module() +class MyReinsTokenMlp(nn.Module): + def __init__( + self, + num_layers: int, + embed_dims: int, + patch_size: int, + query_dims: int = 256, + token_length: int = 100, + use_softmax: bool = True, + link_token_to_query: bool = True, + scale_init: float = 0.001, + zero_mlp_delta_f: bool = False, + mlp_scale=4, + activate=None, + is_depend=True, + is_share=True, + high_high = False, + low_low=False + ) -> None: + super().__init__() + self.activate = activate + self.mlp_scale = mlp_scale + self.num_layers = num_layers + self.embed_dims = embed_dims + self.patch_size = patch_size + self.query_dims = query_dims + self.token_length = token_length + self.link_token_to_query = link_token_to_query + self.scale_init = scale_init + self.use_softmax = use_softmax + self.zero_mlp_delta_f = zero_mlp_delta_f + self.is_depend = is_depend + self.is_share = is_share + self.high_high = high_high + self.low_low = low_low + self.create_model() + self.init_weights() + + + def create_model(self): + val = math.sqrt( + 6.0 + / float( + 3 * reduce(mul, (self.patch_size, self.patch_size), 1) + self.embed_dims + ) + ) + activate = nn.Identity + if self.activate == "silu": + activate = nn.SiLU + ### added by zxc + if self.is_depend: + hidden_num = self.embed_dims//self.mlp_scale + if self.high_high: + hidden_num = self.embed_dims * self.mlp_scale + self.depend_mlp = nn.Parameter( + torch.empty([self.num_layers, self.token_length, self.embed_dims]) + ) + self.inverse_mlp = 
nn.Parameter( + torch.empty([self.num_layers, self.token_length, self.embed_dims]) + ) + nn.init.uniform_(self.depend_mlp.data, -val, val) + nn.init.uniform_(self.inverse_mlp.data, -val, val) + else: + self.depend_mlp = nn.Identity() + + if self.is_share: + hidden_num = self.embed_dims*self.mlp_scale + if self.low_low: + hidden_num = self.embed_dims//self.mlp_scale + self.shared_mlp = nn.Sequential( + nn.Linear(self.embed_dims, hidden_num), + activate(), + nn.Linear(hidden_num, self.embed_dims), + ) + else: + self.shared_mlp = nn.Identity() + ### added by zxc + + # nn.init.uniform_(self.learnable_tokens.data, -val, val) + # nn.init.kaiming_uniform_(self.mlp_delta_f.weight, a=math.sqrt(5)) + # nn.init.kaiming_uniform_(self.mlp_token2feat.weight, a=math.sqrt(5)) + self.scale = 1.0 + if self.zero_mlp_delta_f: + # del self.scale + self.scale = 1.0 + # nn.init.zeros_(self.mlp_delta_f.weight) + # nn.init.zeros_(self.mlp_delta_f.bias) + + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def return_auto(self, feats): + return feats + + def forward( + self, feats: Tensor, layer: int, batch_first=False, has_cls_token=True + ) -> Tensor: + # B 1025 emd_dim + if batch_first: + feats = feats.permute(1, 0, 2) # 1025 B emd_dim + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) # feature: 1024 B emd_dim + # tokens = self.get_tokens(layer) # length * emd_dim + delta_feat = self.forward_delta_feat( + feats, + layer, + ) + delta_feat = self.shared_mlp(delta_feat) + delta_feat = delta_feat * self.scale + feats = feats + delta_feat + if has_cls_token: + feats = torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + + def forward_delta_feat(self, feats: Tensor, layers: int) -> Tensor: + feat = torch.einsum("nbc,mc->nbm", feats, self.depend_mlp[layers]) + return torch.einsum("nbm,mc->nbc", feat, self.inverse_mlp[layers]) + + + + +if __name__ == "__main__": + + features = torch.randn((2,1025,1024)) + layer = 1 + rein = MyReinsTokenMlp( + token_length=100, + embed_dims=1024, + num_layers=24, + patch_size=16, + query_dims=100 + ) + + output = rein(features,1,True) + print(output.shape) \ No newline at end of file diff --git a/cloud_adapter/models/backbones/myreins.py b/cloud_adapter/models/backbones/myreins.py new file mode 100644 index 0000000..ce72a40 --- /dev/null +++ b/cloud_adapter/models/backbones/myreins.py @@ -0,0 +1,148 @@ +from mmseg.models.builder import MODELS +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from functools import reduce +from operator import mul +from timm.models.layers import trunc_normal_ +from torch import Tensor + +@MODELS.register_module() +class MyReins(nn.Module): + def __init__( + self, + num_layers: int, + embed_dims: int, + patch_size: int, + query_dims: int = 256, + token_length: int = 100, + mlp_scale=8, + ) -> None: + super().__init__() + self.mlp_scale = mlp_scale + self.num_layers = num_layers + self.embed_dims = embed_dims + self.patch_size = patch_size + self.query_dims = query_dims + self.token_length = token_length + self.create_model() + self.init_weights() + + + def create_model(self): + self.learnable_tokens = nn.Parameter( + torch.empty([self.num_layers, self.token_length, self.embed_dims]) + ) + val = math.sqrt( + 6.0 + / float( + 3 * reduce(mul, (self.patch_size, self.patch_size), 1) + self.embed_dims + ) 
+ ) + ### added by zxc + if self.is_depend: + hidden_num = self.embed_dims//self.mlp_scale + if self.is_conv: + self.depend_mlp = nn.ModuleList([ + nn.Conv2d(self.embed_dims, self.embed_dims, 7, 1, 3, groups=self.embed_dims) for _ in range(self.num_layers)]) + else: + if self.high_high: + hidden_num = self.embed_dims * self.mlp_scale + self.depend_mlp = nn.ModuleList([ + nn.Sequential( + nn.Linear(self.embed_dims, hidden_num), + nn.Linear(hidden_num, self.embed_dims), + ) for i in range(self.num_layers)]) + else: + self.depend_mlp = nn.Identity() + + + + + if self.is_share: + hidden_num = self.embed_dims*self.mlp_scale + if self.low_low: + hidden_num = self.embed_dims//self.mlp_scale + self.shared_mlp = nn.Sequential( + nn.Linear(self.embed_dims, hidden_num), + activate(), + nn.Linear(hidden_num, self.embed_dims), + ) + else: + self.shared_mlp = nn.Identity() + ### added by zxc + + # nn.init.uniform_(self.learnable_tokens.data, -val, val) + # nn.init.kaiming_uniform_(self.mlp_delta_f.weight, a=math.sqrt(5)) + # nn.init.kaiming_uniform_(self.mlp_token2feat.weight, a=math.sqrt(5)) + self.transform = nn.Linear(self.embed_dims, self.query_dims) + self.merge = nn.Linear(self.query_dims * 3, self.query_dims) + self.scale = 1.0 + if self.zero_mlp_delta_f: + # del self.scale + self.scale = 1.0 + # nn.init.zeros_(self.mlp_delta_f.weight) + # nn.init.zeros_(self.mlp_delta_f.bias) + + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m,nn.Conv2d): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + + def forward( + self, feats: Tensor, layer: int, batch_first=False, has_cls_token=True + ) -> Tensor: + # B 1025 emd_dim + if batch_first: + feats = feats.permute(1, 0, 2) # 1025 B emd_dim + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) # feature: 1024 B emd_dim + + + if self.is_depend: + tokens = self.depend_mlp[layer] + else: + tokens = self.depend_mlp + delta_feat = self.forward_delta_feat( + feats, + tokens, + layer, + ) + delta_feat = self.shared_mlp(delta_feat) + + feats = feats + delta_feat + if has_cls_token: + feats = torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + + def forward_delta_feat(self, feats: Tensor, tokens: Tensor, layers: int) -> Tensor: + return tokens(feats) + + + + +if __name__ == "__main__": + + features = torch.randn((2,1025,1024)) + layer = 1 + rein = MyReins( + token_length=100, + embed_dims=1024, + num_layers=24, + patch_size=16, + query_dims=100, + is_conv=True + ) + + output = rein(features,1,True) + print(output.shape) \ No newline at end of file diff --git a/cloud_adapter/models/backbones/myreinstoken.py b/cloud_adapter/models/backbones/myreinstoken.py new file mode 100644 index 0000000..ae3d9af --- /dev/null +++ b/cloud_adapter/models/backbones/myreinstoken.py @@ -0,0 +1,152 @@ +from mmseg.models.builder import MODELS +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from functools import reduce +from operator import mul +from torch import Tensor +from thop import profile,clever_format + +class TokenLayer(nn.Module): + def __init__( + self, + in_dim, + out_dim, + rank=8, + alpha=16, + ) -> None: + super().__init__() + std_dev = 1/torch.sqrt(torch.tensor(rank).float()) + self.A = torch.nn.Parameter(torch.randn(in_dim, rank)*std_dev) + self.B = 
torch.nn.Parameter(torch.zeros(rank, out_dim)) + self.alpha = alpha + + def forward(self, x): + return self.alpha*(x@self.A@self.B) + + +@MODELS.register_module() +class MyReinsToken(nn.Module): + def __init__( + self, + num_layers: int, + embed_dims: int, + patch_size: int, + query_dims: int = 256, + token_length: int = 100, + use_softmax: bool = True, + link_token_to_query: bool = True, + scale_init: float = 0.001, + zero_mlp_delta_f: bool = False, + mlp_scale=4, + rank=8, + alpha=16, + activate=None, + ) -> None: + super().__init__() + self.rank = rank + self.alpha = alpha + self.activate = activate + self.mlp_scale = mlp_scale + self.num_layers = num_layers + self.embed_dims = embed_dims + self.patch_size = patch_size + self.query_dims = query_dims + self.token_length = token_length + self.link_token_to_query = link_token_to_query + self.scale_init = scale_init + self.use_softmax = use_softmax + self.zero_mlp_delta_f = zero_mlp_delta_f + self.scale = 1.0 + self.create_model() + + + def create_model(self): + std_dev = 1/torch.sqrt(torch.tensor(self.rank).float()) + + self.tokens = nn.Parameter( + torch.empty([self.num_layers, self.embed_dims,self.token_length]) + ) + self.inverse_token = nn.Parameter( + torch.empty([self.num_layers, self.token_length, self.embed_dims]) + ) + val = math.sqrt( + 6.0 + / float( + 3 * reduce(mul, (self.patch_size, self.patch_size), 1) + self.embed_dims + ) + ) + nn.init.uniform_(self.tokens.data, -val, val) + nn.init.uniform_(self.inverse_token.data, -val, val) + + # self.tokens = nn.Parameter( + # torch.randn(self.num_layers,self.embed_dims, self.rank)*std_dev + # ) + + self.C = torch.nn.Parameter(torch.randn(self.embed_dims,self.rank * self.embed_dims)*std_dev) + self.D = torch.nn.Parameter(torch.zeros(self.rank * self.embed_dims, self.embed_dims)) + + def return_auto(self, feats): + return feats + + def get_tokens(self, layer: int) -> Tensor: + return self.tokens[layer] + + def forward( + self, feats: Tensor, layer: int, batch_first=False, has_cls_token=True + ) -> Tensor: + # B 1025 emd_dim + if batch_first: + feats = feats.permute(1, 0, 2) # 1025 B emd_dim + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) # feature: 1024 B emd_dim + + + + + # tokens = self.get_tokens(layer) # length * emd_dim + # delta_feat = self.forward_delta_feat( + # feats, + # tokens, + # layer, + # ) + delta_feat = self.alpha * (feats @ self.tokens[layer] @ self.inverse_token[layer]) + delta_feat = self.alpha*(delta_feat@self.C@self.D) + # delta_feat = self.shared_mlp(delta_feat) + delta_feat = delta_feat * self.scale + feats = feats + delta_feat + + + + + if has_cls_token: + feats = torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + + # def forward_delta_feat(self, feats: Tensor, tokens: Tensor, layers: int) -> Tensor: + # attn = torch.einsum("nbc,mc->nbm", feats, tokens) + + # return torch.einsum("nbm,mc->nbc", attn, self.inverse_token[layers]) + + + + +if __name__ == "__main__": + + features = torch.randn((2,1025,1024)) + layer = 1 + rein = MyReinsToken( + token_length=50, + embed_dims=1024, + num_layers=24, + patch_size=16, + query_dims=100 + ) + # print(rein) + output = rein(features,1,True) + print(output.shape) + params = sum(p.numel() for p in rein.parameters() if p.requires_grad) + print(f"Backbone trainable parameters: {params / 1e6:.2f}M") \ No newline at end of file diff --git a/cloud_adapter/models/backbones/pmaa.py b/cloud_adapter/models/backbones/pmaa.py new file mode 100644 index 
0000000..56a0b43 --- /dev/null +++ b/cloud_adapter/models/backbones/pmaa.py @@ -0,0 +1,163 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from timm.layers import CondConv2d, get_condconv_initializer, create_conv2d, DropPath, get_norm_act_layer + + +def num_groups(group_size, channels): + if not group_size: + return 1 + else: + assert channels % group_size == 0 + return channels // group_size + + +def _init_weight_goog(m, n='', fix_group_fanout=True): + if isinstance(m, CondConv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + init_weight_fn = get_condconv_initializer( + lambda w: nn.init.normal_(w, 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) + init_weight_fn(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + nn.init.normal_(m.weight, 0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + fan_out = m.weight.size(0) + fan_in = 0 + if 'routing_fn' in n: + fan_in = m.weight.size(1) + init_range = 1.0 / math.sqrt(fan_in + fan_out) + nn.init.uniform_(m.weight, -init_range, init_range) + if m.bias is not None: + nn.init.zeros_(m.bias) + + +class DepthwiseSeparableConv(nn.Module): + def __init__( + self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='', + noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + se_layer=None, drop_path_rate=0.): + super(DepthwiseSeparableConv, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer) + groups = num_groups(group_size, in_chs) + self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip + self.has_pw_act = pw_act + + self.conv_dw = create_conv2d( + in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, groups=groups) + self.bn1 = norm_act_layer(in_chs, inplace=True) + + self.se = se_layer( + in_chs, act_layer=act_layer) if se_layer else nn.Identity() + + self.conv_pw = create_conv2d( + in_chs, out_chs, pw_kernel_size, padding=pad_type) + self.bn2 = norm_act_layer( + out_chs, inplace=True, apply_act=self.has_pw_act) + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': + return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) + else: + return dict(module='', hook_type='', num_chs=self.conv_pw.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + x = self.bn1(x) + x = self.se(x) + x = self.conv_pw(x) + x = self.bn2(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + + + +class PMAAConvBlock(nn.Module): + def __init__(self, in_channels=3, hidden_channels=256, depth=4, norm=nn.BatchNorm2d, act=nn.ReLU, return_multi_feats=False): + super().__init__() + self.depth = depth + self.return_multi_feats=return_multi_feats + + self.proj_1x1 = DepthwiseSeparableConv(in_channels, hidden_channels, dw_kernel_size=1, norm_layer=norm, act_layer=act) + + self.spp_dw = nn.ModuleList() + + self.spp_dw.append( + DepthwiseSeparableConv(hidden_channels, hidden_channels, dw_kernel_size=3, stride=1, group_size=hidden_channels, pad_type="same") + ) + + for _ in 
range(self.depth): + self.spp_dw.append( + DepthwiseSeparableConv( + hidden_channels, hidden_channels, dw_kernel_size=3, stride=2, group_size=hidden_channels + ) + ) # 256 128 64 32 + + self._init_weights() + + def forward(self, x): + B, C, H, W = x.shape + + output1 = self.proj_1x1(x) + output = [self.spp_dw[0](output1)] + + for k in range(1, self.depth+1): + out_k = self.spp_dw[k](output[-1]) + output.append(out_k) + + + if self.return_multi_feats: + return output + else: + global_f = torch.zeros(output[-1].shape, requires_grad=True, device=output1.device) + for fea in output: + global_f = global_f + F.adaptive_avg_pool2d( + fea, output_size=output[-1].shape[-2:] + ) + return global_f + + def _init_weights(self): + init_fn = _init_weight_goog + for n, m in self.named_modules(): + init_fn(m, n) + + +class PMAA(nn.Module): + def __init__(self, hidden_channels=256, depth=4, norm=nn.BatchNorm2d, act=nn.ReLU, return_multi_feats=False) -> None: + super(PMAA, self).__init__() + self.net= PMAAConvBlock(3, hidden_channels, depth=depth, norm=norm, act=act, return_multi_feats=return_multi_feats) + + + def forward(self, x): + return self.net(x) + + +if __name__ == '__main__': + inp = torch.randn(1, 3, 512, 512) + net = PMAA(64) + out = net(inp) + for i in out: + print(i.shape) + + # compute macs (Gflops) and params (MB) + from thop import profile + macs, params = profile(net, inputs=(inp, )) + print(f"macs: {macs/1e9:.2f} Gflops") + print(f"params: {params/1e6:.2f} MB") \ No newline at end of file diff --git a/cloud_adapter/models/backbones/pmaaadapter.py b/cloud_adapter/models/backbones/pmaaadapter.py new file mode 100644 index 0000000..b395ca2 --- /dev/null +++ b/cloud_adapter/models/backbones/pmaaadapter.py @@ -0,0 +1,275 @@ +import torch +from torch import nn +from einops import rearrange +from torch import nn, einsum +from einops import rearrange +from mmseg.models.builder import MODELS +import math +import torch +from torch import nn as nn +from mmseg.models.builder import MODELS +from timm.layers import DropPath,trunc_normal_ +from typing import List +from timm.layers import create_act_layer +from functools import partial +import torch.nn.functional as F + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from timm.layers import CondConv2d, get_condconv_initializer, create_conv2d, DropPath, get_norm_act_layer + + +def num_groups(group_size, channels): + if not group_size: + return 1 + else: + assert channels % group_size == 0 + return channels // group_size + + +def _init_weight_goog(m, n='', fix_group_fanout=True): + if isinstance(m, CondConv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + init_weight_fn = get_condconv_initializer( + lambda w: nn.init.normal_(w, 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) + init_weight_fn(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + nn.init.normal_(m.weight, 0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + fan_out = m.weight.size(0) + fan_in = 0 + if 'routing_fn' in n: + fan_in = m.weight.size(1) + init_range = 1.0 / math.sqrt(fan_in + fan_out) + nn.init.uniform_(m.weight, -init_range, init_range) + if m.bias is not None: + 
nn.init.zeros_(m.bias) + + +class DepthwiseSeparableConv(nn.Module): + def __init__( + self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='', + noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + se_layer=None, drop_path_rate=0.): + super(DepthwiseSeparableConv, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer) + groups = num_groups(group_size, in_chs) + self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip + self.has_pw_act = pw_act + + self.conv_dw = create_conv2d( + in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, groups=groups) + self.bn1 = norm_act_layer(in_chs, inplace=True) + + self.se = se_layer( + in_chs, act_layer=act_layer) if se_layer else nn.Identity() + + self.conv_pw = create_conv2d( + in_chs, out_chs, pw_kernel_size, padding=pad_type) + self.bn2 = norm_act_layer( + out_chs, inplace=True, apply_act=self.has_pw_act) + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': + return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) + else: + return dict(module='', hook_type='', num_chs=self.conv_pw.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + x = self.bn1(x) + x = self.se(x) + x = self.conv_pw(x) + x = self.bn2(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + + + +class PMAAConvBlock(nn.Module): + def __init__(self, in_channels=3, hidden_channels=256, depth=4, norm=nn.BatchNorm2d, act=nn.ReLU, return_multi_feats=False): + super().__init__() + self.depth = depth + self.return_multi_feats=return_multi_feats + + self.proj_1x1 = DepthwiseSeparableConv(in_channels, hidden_channels, dw_kernel_size=1, norm_layer=norm, act_layer=act) + + self.spp_dw = nn.ModuleList() + + self.spp_dw.append( + DepthwiseSeparableConv(hidden_channels, hidden_channels, dw_kernel_size=3, stride=1, group_size=hidden_channels, pad_type="same") + ) + + for _ in range(self.depth): + self.spp_dw.append( + DepthwiseSeparableConv( + hidden_channels, hidden_channels, dw_kernel_size=3, stride=2, group_size=hidden_channels + ) + ) + + self._init_weights() + + def forward(self, x): + B, C, H, W = x.shape + output1 = self.proj_1x1(x) + output = [self.spp_dw[0](output1)] + + for k in range(1, self.depth+1): + out_k = self.spp_dw[k](output[-1]) + output.append(out_k) + + + if self.return_multi_feats: + return output + else: + global_f = torch.zeros(output[-1].shape, requires_grad=True, device=output1.device) + for fea in output: + global_f = global_f + F.adaptive_avg_pool2d( + fea, output_size=output[-1].shape[-2:] + ) + return global_f + + def _init_weights(self): + init_fn = _init_weight_goog + for n, m in self.named_modules(): + init_fn(m, n) + + +class PMAA(nn.Module): + def __init__(self, hidden_channels=256, depth=4, norm=nn.BatchNorm2d, act=nn.ReLU, return_multi_feats=True) -> None: + super(PMAA, self).__init__() + self.net= PMAAConvBlock(3, hidden_channels, depth=depth, norm=norm, act=act, return_multi_feats=return_multi_feats) + + + def forward(self, x): + return self.net(x) + + +class InteractiveModule(nn.Module): + def __init__(self, emd_dim=1024, context_dim=64, kernel: int = 1, norm=nn.BatchNorm2d, local_groups=32, global_groups=2): + super().__init__() + self.local_embedding = nn.Sequential( + nn.Conv2d(emd_dim, emd_dim, kernel, groups=local_groups, + padding=int((kernel - 1) / 2), 
bias=False), + norm(emd_dim) + ) + self.global_embedding = nn.Sequential( + nn.Conv2d(context_dim, emd_dim, kernel, groups=global_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.global_act = nn.Sequential( + nn.Conv2d(context_dim, emd_dim, kernel, groups=global_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.act = nn.Sigmoid() + self._init_weights() + + def _init_weights(self): + init_fn = _init_weight_goog + for n, m in self.named_modules(): + init_fn(m, n) + + def forward(self, x, cache, layer): + + N, B, C = x.shape + H=W=int(math.sqrt(N)) + # reshape x -> B, C, H, W + x = x.permute(1, 2, 0).reshape(B, C, H, W) + local_feat = self.local_embedding(x) + + global_act = self.global_act(cache) + sig_act = F.interpolate(self.act(global_act), size=(H, W)) + + global_feat = self.global_embedding(cache) + global_feat = F.interpolate(global_feat, size=(H, W)) + + out = local_feat * sig_act + global_feat + + return out.permute(2, 3, 0, 1).reshape(N, B, C) + +@MODELS.register_module() +class PMAAAdapter(nn.Module): + def __init__(self, num_layers, emd_dim=1024, context_dim=64, local_groups=32, global_groups=2): + super().__init__() + self.pmaa = PMAA(context_dim) + self.net = nn.ModuleList( + InteractiveModule(emd_dim, context_dim, local_groups=local_groups, global_groups=global_groups) + for _ in range(num_layers) + ) + self.init_weight() + + def init_weight(self): + for m in self.net.modules(): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + + def forward(self, feats, layer, batch_first=True, has_cls_token=True, cache=None): + if batch_first: + feats = feats.permute(1, 0, 2) + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) + + feats = self.net[layer](feats, cache, layer) + + if has_cls_token: + feats = torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + +if __name__ == "__main__": + + x = torch.randn((1, 1025, 1024)) + model = PMAAAdapter(24, 1024, 64, local_groups=32, global_groups=2) # + cache = model.pmaa(torch.randn((1, 3, 512, 512))) + # print(cache.shape) + for feature in cache: + print(feature.shape) + + exit(0) + output = model(x, 0, cache=cache) + + + # output, cache = model(x, 1, cache=cache) + # print(output.shape, cache.shape) + + # compute params (Mb) of the total model + params= sum(p.numel() for p in model.parameters()) / 1e6 + print(f"Total params: {params:.2f} Mb") + + # # compute params (Mb) of the pmaa model + params= sum(p.numel() for p in model.pmaa.parameters()) / 1e6 + print(f"PMAA params: {params:.2f} Mb") + + # compute macs (Gflps) and params (Mb) of the total model + # from thop import profile + # macs, params = profile(model, inputs=((x, 0, True, True, cache)), verbose=False) + # print(f"Total macs: {macs / 1e9:.2f} Gflps, params: {params / 1e6:.2f} Mb") + + # # compute macs (Gflps) and params (Mb) of the pmaa model + # macs, params = profile(model.pmaa, inputs=(torch.randn(1, 3, 512, 512),), verbose=False) + # print(f"PMAA macs: {macs / 1e9:.2f} Gflps, params: {params / 1e6:.2f} Mb") + + # diff --git a/cloud_adapter/models/backbones/pmaaadapter_dinov2.py b/cloud_adapter/models/backbones/pmaaadapter_dinov2.py new file mode 100644 index 0000000..ff1eb5d --- /dev/null +++ b/cloud_adapter/models/backbones/pmaaadapter_dinov2.py @@ -0,0 +1,92 @@ +from mmseg.models.builder import BACKBONES, MODELS +from torch import nn as nn +from .pmaaadapter import PMAAAdapter +from .dino_v2 import 
DinoVisionTransformer +from .utils import set_requires_grad, set_train +import torch +import torch.nn.functional as F + +@BACKBONES.register_module() +class PMAAAdapterDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + pmaa_adapter_config=None, + has_cat = False, + **kwargs, + ): + super().__init__(**kwargs) + self.pmaa_adapter: PMAAAdapter = MODELS.build(pmaa_adapter_config) + self.has_cat = has_cat + + + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + cache = self.pmaa_adapter.pmaa(x) # obtain multi-scale features + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + x = self.pmaa_adapter.forward( + x, + idx, + batch_first=True, + has_cls_token=True, + cache=cache, + ) + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return outs, cache + + def forward(self, *args, **kwargs): + ret, cache = self.forward_features(*args, **kwargs) + if isinstance(ret[0], torch.Tensor): + ret[0] = F.interpolate( + ret[0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[1] = F.interpolate( + ret[1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[3] = F.interpolate( + ret[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + ret[0] = torch.cat((ret[0], cache[0]), dim=1) + ret[1] = torch.cat((ret[1], cache[1]), dim=1) + ret[2] = torch.cat((ret[2], cache[2]), dim=1) + ret[3] = torch.cat((ret[3], cache[3]), dim=1) + # ret[0] = torch.cat(ret[0], cache[0], dim=1) # bs 1024 128 128, bs 256 128 128 + else: + ret[0][0] = F.interpolate( + ret[0][0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[0][1] = F.interpolate( + ret[0][1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[0][3] = F.interpolate( + ret[0][3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + ret[0][0] = torch.cat((ret[0][0], cache[0]), dim=1) + ret[0][1] = torch.cat((ret[0][1], cache[1]), dim=1) + ret[0][2] = torch.cat((ret[0][2], cache[2]), dim=1) + ret[0][3] = torch.cat((ret[0][3], cache[3]), dim=1) + return ret + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["pmaa_adapter"]) + set_train(self, ["pmaa_adapter"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "pmaa_adapter" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state \ No newline at end of file diff --git a/cloud_adapter/models/backbones/rein_token_divo2.py b/cloud_adapter/models/backbones/rein_token_divo2.py new file mode 100644 index 0000000..514638c --- /dev/null +++ b/cloud_adapter/models/backbones/rein_token_divo2.py @@ -0,0 +1,49 @@ +from mmseg.models.builder import BACKBONES, MODELS +from .myreinstoken import MyReinsToken +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train + + +@BACKBONES.register_module() +class ReinsTokenDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + reins_config=None, + **kwargs, + ): + super().__init__(**kwargs) + self.reins: MyReinsToken = MODELS.build(reins_config) + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for 
idx, blk in enumerate(self.blocks): + x = blk(x) + x = self.reins.forward( + x, + idx, + batch_first=True, + has_cls_token=True, + ) + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return self.reins.return_auto(outs) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "reins" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/reins.py b/cloud_adapter/models/backbones/reins.py new file mode 100644 index 0000000..a06ada7 --- /dev/null +++ b/cloud_adapter/models/backbones/reins.py @@ -0,0 +1,149 @@ +from mmseg.models.builder import MODELS +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from functools import reduce +from operator import mul +from torch import Tensor + + +@MODELS.register_module() +class Reins(nn.Module): + def __init__( + self, + num_layers: int, + embed_dims: int, + patch_size: int, + query_dims: int = 256, + token_length: int = 100, + use_softmax: bool = True, + link_token_to_query: bool = True, + scale_init: float = 0.001, + zero_mlp_delta_f: bool = False, + ) -> None: + super().__init__() + self.num_layers = num_layers + self.embed_dims = embed_dims + self.patch_size = patch_size + self.query_dims = query_dims + self.token_length = token_length + self.link_token_to_query = link_token_to_query + self.scale_init = scale_init + self.use_softmax = use_softmax + self.zero_mlp_delta_f = zero_mlp_delta_f + self.create_model() + + def create_model(self): + self.learnable_tokens = nn.Parameter( + torch.empty([self.num_layers, self.token_length, self.embed_dims]) + ) + # scale and the two projections are referenced below and in forward_delta_feat, so create them here + self.scale = nn.Parameter(torch.tensor(self.scale_init)) + self.mlp_token2feat = nn.Linear(self.embed_dims, self.embed_dims) + self.mlp_delta_f = nn.Linear(self.embed_dims, self.embed_dims) + val = math.sqrt( + 6.0 + / float( + 3 * reduce(mul, (self.patch_size, self.patch_size), 1) + self.embed_dims + ) + ) + nn.init.uniform_(self.learnable_tokens.data, -val, val) + nn.init.kaiming_uniform_(self.mlp_delta_f.weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.mlp_token2feat.weight, a=math.sqrt(5)) + self.transform = nn.Linear(self.embed_dims, self.query_dims) + self.merge = nn.Linear(self.query_dims * 3, self.query_dims) + if self.zero_mlp_delta_f: + del self.scale + self.scale = 1.0 + nn.init.zeros_(self.mlp_delta_f.weight) + nn.init.zeros_(self.mlp_delta_f.bias) + + def return_auto(self, feats): + if self.link_token_to_query: + tokens = self.transform(self.get_tokens(-1)).permute(1, 2, 0) + tokens = torch.cat( + [ + F.max_pool1d(tokens, kernel_size=self.num_layers), + F.avg_pool1d(tokens, kernel_size=self.num_layers), + tokens[:, :, -1].unsqueeze(-1), + ], + dim=-1, + ) + querys = self.merge(tokens.flatten(-2, -1)) + return feats, querys + else: + return feats + + def get_tokens(self, layer: int) -> Tensor: + if layer == -1: + # return all + return self.learnable_tokens + else: + return self.learnable_tokens[layer] + + def forward( + self, feats: Tensor, layer: int, batch_first=False, has_cls_token=True + ) -> Tensor: + if batch_first: + feats = feats.permute(1, 0, 2) + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) + tokens = self.get_tokens(layer) + delta_feat = self.forward_delta_feat( + feats, + tokens, + layer, + ) + delta_feat = delta_feat * self.scale + feats = feats + delta_feat + if has_cls_token: + feats = 
torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + + def forward_delta_feat(self, feats: Tensor, tokens: Tensor, layers: int) -> Tensor: + attn = torch.einsum("nbc,mc->nbm", feats, tokens) + if self.use_softmax: + attn = attn * (self.embed_dims**-0.5) + attn = F.softmax(attn, dim=-1) + delta_f = torch.einsum( + "nbm,mc->nbc", + attn[:, :, 1:], + self.mlp_token2feat(tokens[1:, :]), + ) + delta_f = self.mlp_delta_f(delta_f + feats) + return delta_f + + +@MODELS.register_module() +class LoRAReins(Reins): + def __init__(self, lora_dim=16, **kwargs): + self.lora_dim = lora_dim + super().__init__(**kwargs) + + def create_model(self): + super().create_model() + del self.learnable_tokens + self.learnable_tokens_a = nn.Parameter( + torch.empty([self.num_layers, self.token_length, self.lora_dim]) + ) + self.learnable_tokens_b = nn.Parameter( + torch.empty([self.num_layers, self.lora_dim, self.embed_dims]) + ) + val = math.sqrt( + 6.0 + / float( + 3 * reduce(mul, (self.patch_size, self.patch_size), 1) + + (self.embed_dims * self.lora_dim) ** 0.5 + ) + ) + nn.init.uniform_(self.learnable_tokens_a.data, -val, val) + nn.init.uniform_(self.learnable_tokens_b.data, -val, val) + + def get_tokens(self, layer): + if layer == -1: + return self.learnable_tokens_a @ self.learnable_tokens_b + else: + return self.learnable_tokens_a[layer] @ self.learnable_tokens_b[layer] diff --git a/cloud_adapter/models/backbones/reins_clip.py b/cloud_adapter/models/backbones/reins_clip.py new file mode 100644 index 0000000..14ba2f4 --- /dev/null +++ b/cloud_adapter/models/backbones/reins_clip.py @@ -0,0 +1,95 @@ +from mmseg.models.builder import BACKBONES, MODELS +from .reins import Reins +from .clip import CLIPVisionTransformer +from .utils import set_requires_grad, set_train +import torch +import torch.nn.functional as F + + +@BACKBONES.register_module() +class ReinsCLIPVisionTransformer(CLIPVisionTransformer): + def __init__( + self, + reins_config=None, + **kwargs, + ): + super().__init__(**kwargs) + self.reins: Reins = MODELS.build(reins_config) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + B, C, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat( + [ + self.class_embedding.to(x.dtype) + + torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device + ), + x, + ], + dim=1, + ) # shape = [*, grid ** 2 + 1, width] + + pos = self.positional_embedding.to(x.dtype) + cls_pos = pos[0, :] + self.class_embedding.to(x.dtype) + spatial_pos = F.interpolate( + pos[1:,] + .reshape(1, self.spatial_size, self.spatial_size, C) + .permute(0, 3, 1, 2), + size=(H, W), + mode="bilinear", + ) + spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1) + pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1) + x = x + pos + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # NLD -> LND + + features = [] + for i, blk in enumerate(self.transformer.resblocks): + x = blk(x) + x = self.reins.forward(x, i, batch_first=False, has_cls_token=True) + if i in self.out_indices: + xp = x.permute(1, 0, 2)[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W) + features.append(xp.contiguous()) + ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] + for i in range(len(features)): + features[i] = ops[i](features[i]) + if self.get_embeddings: + x = x.permute(1, 0, 2) + x = self.ln_post(x) + x = x @ self.proj + + global_embedding = x[:, 
0] + visual_embedding = ( + x[:, 1:].reshape(B, H, W, -1).permute(0, 3, 1, 2) + ) # B C H W + + features.append([global_embedding, visual_embedding]) + # features[0] = F.interpolate( + # features[0], scale_factor=4, mode="bilinear", align_corners=False + # ) + # features[1] = F.interpolate( + # features[1], scale_factor=2, mode="bilinear", align_corners=False + # ) + # features[3] = F.interpolate( + # features[3], scale_factor=0.5, mode="bilinear", align_corners=False + # ) + return self.reins.return_auto(tuple(features)) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["fpn", "reins"]) + set_train(self, ["fpn", "reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if ("rein" not in k) and ('fpn' not in k)] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/reins_convnext.py b/cloud_adapter/models/backbones/reins_convnext.py new file mode 100644 index 0000000..bfba849 --- /dev/null +++ b/cloud_adapter/models/backbones/reins_convnext.py @@ -0,0 +1,66 @@ +from mmpretrain.models.backbones import ConvNeXt +from mmseg.models.builder import BACKBONES, MODELS +from .reins import Reins +from .utils import set_requires_grad, set_train +from typing import List, Dict +import torch.nn as nn + + +@BACKBONES.register_module() +class ReinsConvNeXt(ConvNeXt): + def __init__( + self, + distinct_cfgs: List[Dict] = None, + reins_config: Dict = None, + **kwargs, + ): + super().__init__(**kwargs) + self.reins: List[Reins] = nn.ModuleList() + for cfgs in distinct_cfgs: + reins_config.update(cfgs) + self.reins.append(MODELS.build(reins_config)) + + def forward(self, x): + outs = [] + for i, stage in enumerate(self.stages): + x = self.downsample_layers[i](x) + for idx_sublayer, sublayer in enumerate(stage): + x = sublayer(x) + B, C, H, W = x.shape + x = ( + self.reins[i] + .forward( + x.flatten(-2, -1).permute(0, 2, 1), + idx_sublayer, + batch_first=True, + has_cls_token=False, + ) + .permute(0, 2, 1) + .reshape(B, C, H, W) + ) + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + if self.gap_before_final_norm: + gap = x.mean([-2, -1], keepdim=True) + outs.append(self.reins[i].return_auto(norm_layer(gap).flatten(1))) + else: + # The output of LayerNorm2d may be discontiguous, which + # may cause some problem in the downstream tasks + outs.append(self.reins[i].return_auto(norm_layer(x).contiguous())) + + return [f1 for f1, _ in outs], sum([f2 for _, f2 in outs]) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "rein" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/reins_dinov2.py b/cloud_adapter/models/backbones/reins_dinov2.py new file mode 100644 index 0000000..e765708 --- /dev/null +++ b/cloud_adapter/models/backbones/reins_dinov2.py @@ -0,0 +1,49 @@ +from mmseg.models.builder import BACKBONES, MODELS +from .reins import Reins +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train + + +@BACKBONES.register_module() +class 
ReinsDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + reins_config=None, + **kwargs, + ): + super().__init__(**kwargs) + self.reins: Reins = MODELS.build(reins_config) + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + x = self.reins.forward( + x, + idx, + batch_first=True, + has_cls_token=True, + ) + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, H, W).contiguous() + ) + return self.reins.return_auto(outs) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "rein" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/reins_eva_02.py b/cloud_adapter/models/backbones/reins_eva_02.py new file mode 100644 index 0000000..018d947 --- /dev/null +++ b/cloud_adapter/models/backbones/reins_eva_02.py @@ -0,0 +1,69 @@ +from .eva_02 import EVA2 +from mmseg.models.builder import BACKBONES, MODELS +from .reins import Reins +import torch +import torch.utils.checkpoint as checkpoint +import torch.nn.functional as F +from .utils import set_requires_grad, set_train + + +@BACKBONES.register_module() +class ReinsEVA2(EVA2): + def __init__(self, reins_config=None, **kwargs): + super().__init__(**kwargs) + self.reins: Reins = MODELS.build(reins_config) + + def forward_features(self, x): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand( + batch_size, -1, -1 + ) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + features = [] + for i, blk in enumerate(self.blocks): + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, rel_pos_bias) + else: + x = blk(x, rel_pos_bias) + x = self.reins.forward( + x, + i, + batch_first=True, + has_cls_token=True, + ) + if i in self.out_indices: + xp = x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp) + features.append(xp.contiguous()) + features[0] = F.interpolate( + features[0], scale_factor=4, mode="bilinear", align_corners=False + ) + features[1] = F.interpolate( + features[1], scale_factor=2, mode="bilinear", align_corners=False + ) + features[3] = F.interpolate( + features[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + return self.reins.return_auto(features) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "rein" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/reins_resnet.py b/cloud_adapter/models/backbones/reins_resnet.py new file mode 100644 index 0000000..5a01826 --- /dev/null +++ 
b/cloud_adapter/models/backbones/reins_resnet.py @@ -0,0 +1,65 @@ +from mmseg.models.builder import BACKBONES, MODELS +from .reins import Reins +from mmseg.models.backbones import ResNetV1c +from .utils import set_requires_grad, set_train +from typing import List, Dict +import torch.nn as nn + + +@BACKBONES.register_module() +class ReinsResNetV1c(ResNetV1c): + def __init__( + self, + distinct_cfgs: List[Dict] = None, + reins_config: Dict = None, + **kwargs, + ): + super().__init__(**kwargs) + self.reins: List[Reins] = nn.ModuleList() + for cfgs in distinct_cfgs: + reins_config.update(cfgs) + self.reins.append(MODELS.build(reins_config)) + + def forward(self, x): + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + for idx_sublayer, sublayer in enumerate(res_layer): + x = sublayer(x) + B, C, H, W = x.shape + x = ( + self.reins[i] + .forward( + x.flatten(-2, -1).permute(0, 2, 1), + idx_sublayer, + batch_first=True, + has_cls_token=False, + ) + .permute(0, 2, 1) + .reshape(B, C, H, W) + ) + if i in self.out_indices: + outs.append(self.reins[i].return_auto(x)) + return [f1 for f1, _ in outs], sum([f2 for _, f2 in outs]) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "rein" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/reins_sam_vit.py b/cloud_adapter/models/backbones/reins_sam_vit.py new file mode 100644 index 0000000..fe1359b --- /dev/null +++ b/cloud_adapter/models/backbones/reins_sam_vit.py @@ -0,0 +1,64 @@ +from mmseg.models.builder import BACKBONES, MODELS +from .reins import Reins +from .sam_vit import SAMViT +from .utils import set_requires_grad, set_train +import torch +import torch.nn.functional as F + + +@BACKBONES.register_module() +class ReinsSAMViT(SAMViT): + def __init__( + self, + reins_config=None, + **kwargs, + ): + super().__init__(**kwargs) + self.rein_enabled_layers: list = kwargs.get("global_attn_indexes") + self.reins: Reins = MODELS.build(reins_config) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, C, H, W = x.shape + x = self.patch_embed(x) + Hp, Wp = H // self.patch_size, W // self.patch_size + if self.pos_embed is not None: + x = x + self.pos_embed + features = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + B, H, W, C = x.shape + if idx in self.rein_enabled_layers: + x = self.reins.forward( + x.view(B, -1, C), + self.rein_enabled_layers.index(idx), + batch_first=True, + has_cls_token=False, + ).view(B, H, W, C) + # 4,32,32,768 + if idx in self.out_indices: + features.append(x.permute(0, 3, 1, 2)) + features[0] = F.interpolate( + features[0], scale_factor=4, mode="bilinear", align_corners=False + ) + features[1] = F.interpolate( + features[1], scale_factor=2, mode="bilinear", align_corners=False + ) + features[3] = F.interpolate( + features[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + return self.reins.return_auto(tuple(features)) + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["reins"]) + set_train(self, ["reins"]) + + def 
state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "rein" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/cloud_adapter/models/backbones/sam_vit.py b/cloud_adapter/models/backbones/sam_vit.py new file mode 100644 index 0000000..cdeac6d --- /dev/null +++ b/cloud_adapter/models/backbones/sam_vit.py @@ -0,0 +1,467 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional, Tuple, Type +from mmseg.models.builder import BACKBONES +from mmengine.model import BaseModule +from functools import partial + + +class MLPBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + mlp_dim: int, + act: Type[nn.Module] = nn.GELU, + ) -> None: + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(x))) + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa +@BACKBONES.register_module() +class SAMViT(BaseModule): + def __init__( + self, + img_size: int = 1024, + out_indices=[3, 5, 7, 11], + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = partial(nn.LayerNorm, eps=1e-6), + act_layer: Type[nn.Module] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + init_cfg=None, + ) -> None: + """ + Args: + img_size (int): Input image size. + patch_size (int): Patch size. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of ViT. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. 
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + global_attn_indexes (list): Indexes for blocks using global attention. + """ + super().__init__(init_cfg) + self.img_size = img_size + self.out_indices = out_indices + self.patch_size = patch_size + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) + + self.pos_embed: Optional[nn.Parameter] = None + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. + self.pos_embed = nn.Parameter( + torch.zeros( + 1, img_size // patch_size, img_size // patch_size, embed_dim + ) + ) + + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size if i not in global_attn_indexes else 0, + input_size=(img_size // patch_size, img_size // patch_size), + ) + self.blocks.append(block) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, C, H, W = x.shape + x = self.patch_embed(x) + Hp, Wp = H // self.patch_size, W // self.patch_size + if self.pos_embed is not None: + x = x + self.pos_embed + features = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + # 4,32,32,768 + if idx in self.out_indices: + features.append(x.permute(0, 3, 1, 2)) + features[0] = F.interpolate( + features[0], scale_factor=4, mode="bilinear", align_corners=False + ) + features[1] = F.interpolate( + features[1], scale_factor=2, mode="bilinear", align_corners=False + ) + features[3] = F.interpolate( + features[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + return tuple(features) + + +class Block(nn.Module): + """Transformer blocks with support of window attention and residual propagation blocks""" + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. If it equals 0, then + use global attention. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. 
+ """ + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else (window_size, window_size), + global_attn=window_size == 0, + ) + + self.norm2 = norm_layer(dim) + self.mlp = MLPBlock( + embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer + ) + + self.window_size = window_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x + x = self.norm1(x) + # Window partition + if self.window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, self.window_size) + + x = self.attn(x) + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, self.window_size, pad_hw, (H, W)) + + x = shortcut + x + x = x + self.mlp(self.norm2(x)) + + return x + + +class Attention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[Tuple[int, int]] = None, + global_attn: int = False, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. + """ + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + assert ( + input_size is not None + ), "Input size must be provided if using relative positional encoding." + # initialize relative positional embeddings + # use input_size=1024 + scale = 4 + if input_size[0] == 16: + scale = 8 + if global_attn: + self.rel_pos_h = nn.Parameter( + torch.zeros(scale * input_size[0] - 1, head_dim) + ) + self.rel_pos_w = nn.Parameter( + torch.zeros(scale * input_size[1] - 1, head_dim) + ) + else: + self.rel_pos_h = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim) + ) + self.rel_pos_w = nn.Parameter( + torch.zeros(2 * input_size[1] - 1, head_dim) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, H, W, _ = x.shape + # qkv with shape (3, B, nHead, H * W, C) + qkv = ( + self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + ) + # q, k, v with shape (B * nHead, H * W, C) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) + + attn = (q * self.scale) @ k.transpose(-2, -1) + + if self.use_rel_pos: + attn = add_decomposed_rel_pos( + attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W) + ) + + attn = attn.softmax(dim=-1) + x = ( + (attn @ v) + .view(B, self.num_heads, H, W, -1) + .permute(0, 2, 3, 1, 4) + .reshape(B, H, W, -1) + ) + x = self.proj(x) + + return x + + +def window_partition( + x: torch.Tensor, window_size: int +) -> Tuple[torch.Tensor, Tuple[int, int]]: + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. 
+ + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. + (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) + return windows, (Hp, Wp) + + +def window_unpartition( + windows: torch.Tensor, + window_size: int, + pad_hw: Tuple[int, int], + hw: Tuple[int, int], +) -> torch.Tensor: + """ + Window unpartition into original sequences and removing padding. + Args: + windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view( + B, Hp // window_size, Wp // window_size, window_size, window_size, -1 + ) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos( + attn: torch.Tensor, + q: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +) -> torch.Tensor: + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. 
+ """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = ( + attn.view(B, q_h, q_w, k_h, k_w) + + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :] + ).view(B, q_h * q_w, k_h * k_w) + + return attn + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. + """ + + def __init__( + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x diff --git a/cloud_adapter/models/backbones/utils.py b/cloud_adapter/models/backbones/utils.py new file mode 100644 index 0000000..0367331 --- /dev/null +++ b/cloud_adapter/models/backbones/utils.py @@ -0,0 +1,58 @@ +import torch.nn as nn +from typing import List +from mmengine.logging import MMLogger + +first_set_requires_grad = True +first_set_train = True + + +def set_requires_grad(model: nn.Module, keywords: List[str]): + """ + notice:key in name! + """ + requires_grad_names = [] + num_params = 0 + num_trainable = 0 + for name, param in model.named_parameters(): + num_params += param.numel() + if any(key in name for key in keywords): + param.requires_grad = True + requires_grad_names.append(name) + num_trainable += param.numel() + else: + param.requires_grad = False + global first_set_requires_grad + if first_set_requires_grad: + logger = MMLogger.get_current_instance() + for name in requires_grad_names: + logger.info(f"set_requires_grad----{name}") + logger.info( + f"Total trainable params--{num_trainable}, All params--{num_params}, Ratio--{num_trainable*100/num_params:.1f}%" + ) + first_set_requires_grad = False + + +def _set_train(model: nn.Module, keywords: List[str], prefix: str = ""): + train_names = [] + for name, child in model.named_children(): + fullname = ".".join([prefix, name]) + if any(name.startswith(key) for key in keywords): + train_names.append(fullname) + child.train() + else: + train_names += _set_train(child, keywords, prefix=fullname) + return train_names + + +def set_train(model: nn.Module, keywords: List[str]): + """ + notice:sub name startwith key! + """ + model.train(False) + train_names = _set_train(model, keywords) + global first_set_train + if first_set_train: + logger = MMLogger.get_current_instance() + for train_name in train_names: + logger.info(f"set_train----{train_name}") + first_set_train = False \ No newline at end of file diff --git a/cloud_adapter/models/backbones/vitadapter_dinov2.py b/cloud_adapter/models/backbones/vitadapter_dinov2.py new file mode 100644 index 0000000..365833a --- /dev/null +++ b/cloud_adapter/models/backbones/vitadapter_dinov2.py @@ -0,0 +1,183 @@ +# Copyright (c) Shanghai AI Lab. All rights reserved. 
+import logging +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmseg.models.builder import BACKBONES +# from ops.modules import MSDeformAttn + +from cloud_adapter.models.backbones import DinoVisionTransformer +from mmcv.ops import MultiScaleDeformableAttention as MSDeformAttn +from timm.models.layers import DropPath, trunc_normal_ +from torch.nn.init import normal_ +from .utils import set_requires_grad, set_train +from cloud_adapter.models.backbones.adapter_modules import SpatialPriorModule, InteractionBlock, deform_inputs + +_logger = logging.getLogger(__name__) + + +@BACKBONES.register_module() +class ViTAdapter(DinoVisionTransformer): + def __init__(self, pretrain_size=224,conv_inplane=64, n_points=4, deform_num_heads=6, + init_values=0., interaction_indexes=None, with_cffn=True, cffn_ratio=0.25,drop_rate=0., + deform_ratio=1.0, add_vit_feature=True, use_extra_extractor=True, **kwargs): + + super().__init__(**kwargs) + + # self.num_classes = 80 + self.cls_token = None + self.num_block = len(self.blocks) + self.pretrain_size = (pretrain_size, pretrain_size) + self.interaction_indexes = interaction_indexes + self.add_vit_feature = add_vit_feature + embed_dim = self.embed_dim + self.pos_drop = nn.Dropout(p=drop_rate) + + self.level_embed = nn.Parameter(torch.zeros(3, embed_dim)) + self.spm = SpatialPriorModule(inplanes=conv_inplane, + embed_dim=embed_dim) + self.interactions = nn.Sequential(*[ + InteractionBlock(dim=embed_dim, num_heads=deform_num_heads, n_points=n_points, + init_values=init_values, drop_path=self.drop_path_rate, + norm_layer=self.norm_layer, with_cffn=with_cffn, + cffn_ratio=cffn_ratio, deform_ratio=deform_ratio, + extra_extractor=((True if i == len(interaction_indexes) - 1 else False) and use_extra_extractor)) + for i in range(len(interaction_indexes)) + ]) + self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2) + self.adapter_norm1 = nn.SyncBatchNorm(embed_dim) + self.adapter_norm2 = nn.SyncBatchNorm(embed_dim) + self.adapter_norm3 = nn.SyncBatchNorm(embed_dim) + self.adapter_norm4 = nn.SyncBatchNorm(embed_dim) + + self.up.apply(self._init_weights) + self.spm.apply(self._init_weights) + self.interactions.apply(self._init_weights) + self.apply(self._init_deform_weights) + normal_(self.level_embed) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def _get_pos_embed(self, pos_embed, H, W): + pos_embed = pos_embed.reshape( + 1, self.pretrain_size[0] // 16, self.pretrain_size[1] // 16, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\ + reshape(1, -1, H * W).permute(0, 2, 1) + return pos_embed + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["level_embed","pos_drop","spm","interactions","up","adapter_norm1","adapter_norm2","adapter_norm3","adapter_norm4"]) + set_train(self, 
["level_embed","pos_drop","spm","interactions","up","adapter_norm1","adapter_norm2","adapter_norm3","adapter_norm4"]) + + # def state_dict(self, destination, prefix, keep_vars): + # state = super().state_dict(destination, prefix, keep_vars) + # keys = [k for k in state.keys() if "loracacheadapter" not in k] + # for key in keys: + # state.pop(key) + # if key in destination: + # destination.pop(key) + # return state + + def _init_deform_weights(self, m): + pass + # if isinstance(m, MSDeformAttn): + # m._reset_parameters() + + def _add_level_embed(self, c2, c3, c4): + c2 = c2 + self.level_embed[0] + c3 = c3 + self.level_embed[1] + c4 = c4 + self.level_embed[2] + return c2, c3, c4 + + def forward(self, x): + B, _, h, w = x.shape + H,W = h//self.patch_size, w//self.patch_size + deform_inputs1, deform_inputs2 = deform_inputs(x) + + # SPM forward + c1, c2, c3, c4 = self.spm(x) + # print(c1.shape) # [2, 1024, 128, 128]) + # print(c2.shape) # [2, 4096, 1024] + # print(c3.shape) # [2, 1024, 1024] + # print(c4.shape) # [2, 256, 1024] + c2, c3, c4 = self._add_level_embed(c2, c3, c4) + c = torch.cat([c2, c3, c4], dim=1) + + # Patch Embedding forward + x = self.patch_embed(x) + bs, n, dim = x.shape + pos_embed = self._get_pos_embed(self.pos_embed[:, 1:], H, W) + x = self.pos_drop(x + pos_embed) + # print(x.shape) # [2, 1024, 1024] + + # Interaction + for i, layer in enumerate(self.interactions): + indexes = self.interaction_indexes[i] + x, c = layer(x, c, self.blocks[indexes[0]:indexes[-1] + 1], + deform_inputs1, deform_inputs2, H, W) + + # Split & Reshape + c2 = c[:, 0:c2.size(1), :] + c3 = c[:, c2.size(1):c2.size(1) + c3.size(1), :] + c4 = c[:, c2.size(1) + c3.size(1):, :] + + c2 = c2.transpose(1, 2).view(bs, dim, H * 2, W * 2).contiguous() + c3 = c3.transpose(1, 2).view(bs, dim, H, W).contiguous() + c4 = c4.transpose(1, 2).view(bs, dim, H // 2, W // 2).contiguous() + c1 = self.up(c2) + c1 + + if self.add_vit_feature: + x3 = x.transpose(1, 2).view(bs, dim, H, W).contiguous() + x1 = F.interpolate(x3, scale_factor=4, mode='bilinear', align_corners=False) + x2 = F.interpolate(x3, scale_factor=2, mode='bilinear', align_corners=False) + x4 = F.interpolate(x3, scale_factor=0.5, mode='bilinear', align_corners=False) + c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4 + + # Final Norm + f1 = self.adapter_norm1(c1) + f2 = self.adapter_norm2(c2) + f3 = self.adapter_norm3(c3) + f4 = self.adapter_norm4(c4) + return [f1, f2, f3, f4] + +if __name__ == "__main__": + device = "cuda:2" + model = ViTAdapter( + pretrain_size=512, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + + interaction_indexes=[[0, 5], [6, 11], [12, 17], [18, 23]], + deform_num_heads=16, + ).to(device) + inp = torch.randn((2,3,512,512)).to(device) + features = model(inp) + for feature in features: + print(feature.shape) \ No newline at end of file diff --git a/cloud_adapter/models/segmentors/__init__.py b/cloud_adapter/models/segmentors/__init__.py new file mode 100644 index 0000000..be67759 --- /dev/null +++ b/cloud_adapter/models/segmentors/__init__.py @@ -0,0 +1,3 @@ +from .frozen_encoder_decoder import FrozenBackboneEncoderDecoder + +__all__ = ["FrozenBackboneEncoderDecoder"] diff --git a/cloud_adapter/models/segmentors/frozen_encoder_decoder.py b/cloud_adapter/models/segmentors/frozen_encoder_decoder.py new file mode 100644 index 0000000..703b3bd --- /dev/null +++ 
b/cloud_adapter/models/segmentors/frozen_encoder_decoder.py @@ -0,0 +1,44 @@ +from typing import List +import torch +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.models.segmentors import EncoderDecoder +from typing import Iterable + + +def detach_everything(everything): + if isinstance(everything, Tensor): + return everything.detach() + elif isinstance(everything, Iterable): + return [detach_everything(x) for x in everything] + else: + return everything + + +@MODELS.register_module() +class FrozenBackboneEncoderDecoder(EncoderDecoder): + def train(self, mode=True): + super().train(mode) + self.backbone.eval() + for param in self.backbone.parameters(): + param.requires_grad = False + + def extract_feat(self, inputs: Tensor) -> List[Tensor]: + """Extract features from images.""" + with torch.no_grad(): + x = self.backbone(inputs) + x = detach_everything(x) + if self.with_neck: + x = self.neck(x) + return x + +@MODELS.register_module() +class FrozenHeadEncoderDecoder(EncoderDecoder): + def train(self, mode=True): + super().train(mode) + + self.decode_head.eval() + for param in self.decode_head.parameters(): + param.requires_grad = False + \ No newline at end of file diff --git a/cloud_adapter/optimizers/__init__.py b/cloud_adapter/optimizers/__init__.py new file mode 100644 index 0000000..8d88495 --- /dev/null +++ b/cloud_adapter/optimizers/__init__.py @@ -0,0 +1 @@ +from .peft_optimizer_constructor import PEFTOptimWrapperConstructor \ No newline at end of file diff --git a/cloud_adapter/optimizers/peft_optimizer_constructor.py b/cloud_adapter/optimizers/peft_optimizer_constructor.py new file mode 100644 index 0000000..43dca84 --- /dev/null +++ b/cloud_adapter/optimizers/peft_optimizer_constructor.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import logging +from typing import List, Optional, Union +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper + +import torch +import torch.nn as nn +from torch.nn import GroupNorm, LayerNorm + +from mmengine.logging import print_log +from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, OPTIMIZERS +from mmengine.utils import is_list_of +from mmengine.utils.dl_utils import mmcv_full_available +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm +from mmengine.optim.optimizer import DefaultOptimWrapperConstructor, OptimWrapper + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class PEFTOptimWrapperConstructor(DefaultOptimWrapperConstructor): + def __init__(self, optim_wrapper_cfg: dict, paramwise_cfg: Optional[dict] = None): + # assert "keywords" in optim_wrapper_cfg + # self.keywords = optim_wrapper_cfg.pop("keywords") + super().__init__(optim_wrapper_cfg, paramwise_cfg) + + def add_params( + self, + params: List[dict], + module: nn.Module, + prefix: str = "", + is_dcn_module: Optional[Union[int, float]] = None, + ) -> None: + # get param-wise options + custom_keys = self.paramwise_cfg.get("custom_keys", {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get("bias_lr_mult", None) + bias_decay_mult = self.paramwise_cfg.get("bias_decay_mult", None) + norm_decay_mult = self.paramwise_cfg.get("norm_decay_mult", None) + dwconv_decay_mult = self.paramwise_cfg.get("dwconv_decay_mult", None) + flat_decay_mult = self.paramwise_cfg.get("flat_decay_mult", None) + bypass_duplicate = self.paramwise_cfg.get("bypass_duplicate", False) + dcn_offset_lr_mult = self.paramwise_cfg.get("dcn_offset_lr_mult", None) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) and module.in_channels == module.groups + ) + + for name, param in module.named_parameters(recurse=False): + if not param.requires_grad: + continue + param_group = {"params": [param]} + if bypass_duplicate and self._is_in(param_group, params): + print_log( + f"{prefix} is duplicate. 
It is skipped since " + f"bypass_duplicate={bypass_duplicate}", + logger="current", + level=logging.WARNING, + ) + continue + if not param.requires_grad: + params.append(param_group) + continue + + # if the parameter match one of the custom keys, ignore other rules + is_custom = False + for key in sorted_keys: + if key in f"{prefix}.{name}": + is_custom = True + lr_mult = custom_keys[key].get("lr_mult", 1.0) + param_group["lr"] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get("decay_mult", 1.0) + param_group["weight_decay"] = self.base_wd * decay_mult + # add custom settings to param_group + for k, v in custom_keys[key].items(): + param_group[k] = v + break + + if not is_custom: + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if ( + name == "bias" + and not (is_norm or is_dcn_module) + and bias_lr_mult is not None + ): + param_group["lr"] = self.base_lr * bias_lr_mult + + if ( + prefix.find("conv_offset") != -1 + and is_dcn_module + and dcn_offset_lr_mult is not None + and isinstance(module, torch.nn.Conv2d) + ): + # deal with both dcn_offset's bias & weight + param_group["lr"] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm and norm_decay_mult is not None: + param_group["weight_decay"] = self.base_wd * norm_decay_mult + # bias lr and decay + elif ( + name == "bias" + and not is_dcn_module + and bias_decay_mult is not None + ): + param_group["weight_decay"] = self.base_wd * bias_decay_mult + # depth-wise conv + elif is_dwconv and dwconv_decay_mult is not None: + param_group["weight_decay"] = self.base_wd * dwconv_decay_mult + # flatten parameters except dcn offset + elif ( + param.ndim == 1 + and not is_dcn_module + and flat_decay_mult is not None + ): + param_group["weight_decay"] = self.base_wd * flat_decay_mult + params.append(param_group) + for key, value in param_group.items(): + full_name = f"{prefix}.{name}" if prefix else name + if key == "params": + print_log( + f"paramwise_options -- {full_name}:num of {key}={sum(v.numel() for v in value)}", + logger="current", + ) + else: + print_log( + f"paramwise_options -- {full_name}:{key}={value}", + logger="current", + ) + + if mmcv_full_available(): + from mmcv.ops import DeformConv2d, ModulatedDeformConv2d + + is_dcn_module = isinstance(module, (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f"{prefix}.{child_name}" if prefix else child_name + self.add_params( + params, child_mod, prefix=child_prefix, is_dcn_module=is_dcn_module + ) + + def __call__(self, model: nn.Module) -> OptimWrapper: + model.train() + if hasattr(model, "module"): + model = model.module + + optim_wrapper_cfg = self.optim_wrapper_cfg.copy() + optim_wrapper_cfg.setdefault("type", "OptimWrapper") + optimizer_cfg = self.optimizer_cfg.copy() + # if no paramwise option is specified, just use the global setting + if not self.paramwise_cfg: + optimizer_cfg["params"] = model.parameters() + optimizer = OPTIMIZERS.build(optimizer_cfg) + else: + # set param-wise lr and weight decay recursively + params: List = [] + self.add_params(params, model) + optimizer_cfg["params"] = params + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + optim_wrapper_cfg, default_args=dict(optimizer=optimizer) + ) + return optim_wrapper diff --git a/cloud_adapter/utils/__init__.py 
b/cloud_adapter/utils/__init__.py new file mode 100644 index 0000000..9fdbc2f --- /dev/null +++ b/cloud_adapter/utils/__init__.py @@ -0,0 +1 @@ +from .init_model import init_model \ No newline at end of file diff --git a/cloud_adapter/utils/class_names.py b/cloud_adapter/utils/class_names.py new file mode 100644 index 0000000..f23a272 --- /dev/null +++ b/cloud_adapter/utils/class_names.py @@ -0,0 +1,47 @@ +from mmengine.utils import is_str + +def hrc_whu_classes(): + return [ + 'clear sky', 'cloud' + ] + +def hrc_whu_palette(): + return [ + [0, 0, 0],[255, 255, 255] + ] +dataset_aliases ={ + 'hrc_whu': ['hrc_whu'], +} + +def get_classes(dataset): + """Get class names of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_classes()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + return labels + + +def get_palette(dataset): + """Get class palette (RGB) of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_palette()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + return labels \ No newline at end of file diff --git a/cloud_adapter/utils/init_model.py b/cloud_adapter/utils/init_model.py new file mode 100644 index 0000000..8f05ecd --- /dev/null +++ b/cloud_adapter/utils/init_model.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import defaultdict +from pathlib import Path +from typing import Optional, Sequence, Union +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmseg.registry import MODELS +from mmseg.utils import SampleList, dataset_aliases, get_classes, get_palette +from cloud_adapter.hooks.load_backbone_hook import load_backbone +import torch + +def init_model(config: Union[str, Path, Config], + checkpoint = None, + device: str = 'cuda:0', + cfg_options: Optional[dict] = None): + """Initialize a segmentor from config file. + + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint: + if type==str: load checkpoint path directly + if type==dict: load checkpoint ['backbone'] and checkpoint['rein_head']. + device (str, optional) CPU/CUDA device option. Default 'cuda:0'. + Use 'cpu' for loading model on CPU. + cfg_options (dict, optional): Options to override some settings in + the used config. + Returns: + nn.Module: The constructed segmentor. 
+ """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + 'but got {}'.format(type(config))) + if cfg_options is not None: + config.merge_from_dict(cfg_options) + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + config.model.pretrained = None + config.model.train_cfg = None + init_default_scope(config.get('default_scope', 'mmseg')) + + model = MODELS.build(config.model) + if checkpoint is not None: + if isinstance(checkpoint,str): + checkpoint=torch.load(checkpoint,map_location='cpu') + elif isinstance(checkpoint,dict): + backbone=checkpoint['backbone'] + rein_head=checkpoint['rein_head'] + checkpoint=torch.load(rein_head,map_location='cpu') + load_backbone(checkpoint,backbone) + else: + raise NotImplementedError() + if 'meta' not in checkpoint: + checkpoint['meta']={} + dataset_meta = checkpoint['meta'].get('dataset_meta', None) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint.get('meta', {}): + # mmseg 1.x + model.dataset_meta = dataset_meta + elif 'CLASSES' in checkpoint.get('meta', {}): + # < mmseg 1.x + classes = checkpoint['meta']['CLASSES'] + palette = checkpoint['meta']['PALETTE'] + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, classes and palette will be' + 'set according to num_classes ') + num_classes = model.decode_head.num_classes + dataset_name = None + for name in dataset_aliases.keys(): + if len(get_classes(name)) == num_classes: + dataset_name = name + break + if dataset_name is None: + warnings.warn( + 'No suitable dataset found, use Cityscapes by default') + dataset_name = 'cityscapes' + model.dataset_meta = { + 'classes': get_classes(dataset_name), + 'palette': get_palette(dataset_name) + } + model.cfg = config # save the config in the model for convenience + model.load_state_dict(checkpoint['state_dict']) + model.to(device) + model.eval() + return model \ No newline at end of file diff --git a/configs/_base_/datasets/bdd100k_512x512.py b/configs/_base_/datasets/bdd100k_512x512.py new file mode 100644 index 0000000..a8bee17 --- /dev/null +++ b/configs/_base_/datasets/bdd100k_512x512.py @@ -0,0 +1,42 @@ +bdd_type = "CityscapesDataset" +bdd_root = "data/bdd100k/" +bdd_crop_size = (512, 512) +bdd_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict(type="Resize", scale=(1280, 720)), + dict(type="RandomCrop", crop_size=bdd_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +bdd_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1280, 720), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_bdd = dict( + type=bdd_type, + data_root=bdd_root, + data_prefix=dict( + img_path="images/10k/train", + seg_map_path="labels/sem_seg/masks/train", + ), + img_suffix=".jpg", + seg_map_suffix=".png", + pipeline=bdd_train_pipeline, +) +val_bdd = dict( + type=bdd_type, + data_root=bdd_root, + data_prefix=dict( + img_path="images/10k/val", + seg_map_path="labels/sem_seg/masks/val", + ), + img_suffix=".jpg", + seg_map_suffix=".png", + 
pipeline=bdd_test_pipeline, +) diff --git a/configs/_base_/datasets/cityscapes_1024x1024.py b/configs/_base_/datasets/cityscapes_1024x1024.py new file mode 100644 index 0000000..65bfbc5 --- /dev/null +++ b/configs/_base_/datasets/cityscapes_1024x1024.py @@ -0,0 +1,38 @@ +cityscapes_type = "CityscapesDataset" +cityscapes_root = "data/cityscapes/" +cityscapes_crop_size = (1024, 1024) +cityscapes_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict(type="Resize", scale=(2048, 1024)), + dict(type="RandomCrop", crop_size=cityscapes_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +cityscapes_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_cityscapes = dict( + type=cityscapes_type, + data_root=cityscapes_root, + data_prefix=dict( + img_path="leftImg8bit/train", + seg_map_path="gtFine/train", + ), + pipeline=cityscapes_train_pipeline, +) +val_cityscapes = dict( + type=cityscapes_type, + data_root=cityscapes_root, + data_prefix=dict( + img_path="leftImg8bit/val", + seg_map_path="gtFine/val", + ), + pipeline=cityscapes_test_pipeline, +) diff --git a/configs/_base_/datasets/cityscapes_512x512.py b/configs/_base_/datasets/cityscapes_512x512.py new file mode 100644 index 0000000..329a5d8 --- /dev/null +++ b/configs/_base_/datasets/cityscapes_512x512.py @@ -0,0 +1,38 @@ +cityscapes_type = "CityscapesDataset" +cityscapes_root = "data/cityscapes/" +cityscapes_crop_size = (512, 512) +cityscapes_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict(type="Resize", scale=(1024, 512)), + dict(type="RandomCrop", crop_size=cityscapes_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +cityscapes_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1024, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_cityscapes = dict( + type=cityscapes_type, + data_root=cityscapes_root, + data_prefix=dict( + img_path="leftImg8bit/train", + seg_map_path="gtFine/train", + ), + pipeline=cityscapes_train_pipeline, +) +val_cityscapes = dict( + type=cityscapes_type, + data_root=cityscapes_root, + data_prefix=dict( + img_path="leftImg8bit/val", + seg_map_path="gtFine/val", + ), + pipeline=cityscapes_test_pipeline, +) diff --git a/configs/_base_/datasets/cloudsen12_high_l1c.py b/configs/_base_/datasets/cloudsen12_high_l1c.py new file mode 100644 index 0000000..c586cd6 --- /dev/null +++ b/configs/_base_/datasets/cloudsen12_high_l1c.py @@ -0,0 +1,62 @@ +dataset_type = 'CLOUDSEN12HIGHL1CDataset' +data_root = 'data/cloudsen12_high_l1c' + +crop_size = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=crop_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=crop_size), + # add loading annotation after ``Resize`` because ground truth + # does 
not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', + seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/test', + seg_map_path='ann_dir/test'), + pipeline=test_pipeline)) + +test_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/test', + seg_map_path='ann_dir/test'), + pipeline=test_pipeline)) +# test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=["mIoU", "mDice", "mFscore"],) +test_evaluator = val_evaluator \ No newline at end of file diff --git a/configs/_base_/datasets/cloudsen12_high_l2a.py b/configs/_base_/datasets/cloudsen12_high_l2a.py new file mode 100644 index 0000000..6c63565 --- /dev/null +++ b/configs/_base_/datasets/cloudsen12_high_l2a.py @@ -0,0 +1,62 @@ +dataset_type = 'CLOUDSEN12HIGHL2ADataset' +data_root = 'data/cloudsen12_high_l2a' + +crop_size = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=crop_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=crop_size), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', + seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/test', + seg_map_path='ann_dir/test'), + pipeline=test_pipeline)) + +test_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/test', + seg_map_path='ann_dir/test'), + pipeline=test_pipeline)) +# test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=["mIoU", "mDice", "mFscore"],) +test_evaluator = val_evaluator \ No newline at end of file diff --git a/configs/_base_/datasets/dg_citys2acdc_1024x1024.py b/configs/_base_/datasets/dg_citys2acdc_1024x1024.py new file mode 100644 index 0000000..53da1ab --- /dev/null +++ b/configs/_base_/datasets/dg_citys2acdc_1024x1024.py @@ -0,0 +1,58 @@ +_base_ = [ + "./fog-acdc_1024x1024.py", + "./night-acdc_1024x1024.py", + "./rain-acdc_1024x1024.py", + "./snow-acdc_1024x1024.py", + 
"./cityscapes_1024x1024.py", +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type="InfiniteSampler", shuffle=True), + dataset={{_base_.train_cityscapes}}, +) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type="ConcatDataset", + datasets=[ + {{_base_.val_night_acdc}}, + {{_base_.val_snow_acdc}}, + {{_base_.val_fog_acdc}}, + {{_base_.val_rain_acdc}}, + {{_base_.val_cityscapes}}, + ], + ), +) +test_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type="ConcatDataset", + datasets=[ + {{_base_.test_night_acdc}}, + {{_base_.test_snow_acdc}}, + {{_base_.test_fog_acdc}}, + {{_base_.test_rain_acdc}}, + ], + ), +) +val_evaluator = dict( + type="DGIoUMetric", + iou_metrics=["mIoU"], + dataset_keys=["night/", "cityscapes/", "fog/", "snow/", "rain/"], + mean_used_keys=["night/", "fog/", "snow/", "rain/"], +) +test_evaluator = dict( + type="IoUMetric", + iou_metrics=["mIoU"], + format_only=True, + output_dir="work_dirs/format_results", +) diff --git a/configs/_base_/datasets/dg_citys2acdc_512x512.py b/configs/_base_/datasets/dg_citys2acdc_512x512.py new file mode 100644 index 0000000..550f2ca --- /dev/null +++ b/configs/_base_/datasets/dg_citys2acdc_512x512.py @@ -0,0 +1,45 @@ +_base_ = [ + "./fog-acdc_512x512.py", + "./night-acdc_512x512.py", + "./rain-acdc_512x512.py", + "./snow-acdc_512x512.py", + "./cityscapes_512x512.py", +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type="InfiniteSampler", shuffle=True), + dataset={{_base_.train_cityscapes}}, +) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type="ConcatDataset", + datasets=[ + {{_base_.val_night_acdc}}, + {{_base_.val_snow_acdc}}, + {{_base_.val_fog_acdc}}, + {{_base_.val_rain_acdc}}, + {{_base_.val_cityscapes}}, + ], + ), +) +test_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset={{_base_.val_cityscapes}}, +) +val_evaluator = dict( + type="DGIoUMetric", + iou_metrics=["mIoU"], + dataset_keys=["night/", "citys/", "fog/", "snow/", "rain/"], + mean_used_keys=["night/", "fog/", "snow/", "rain/"], +) +test_evaluator = dict(type="IoUMetric", iou_metrics=["mIoU"]) diff --git a/configs/_base_/datasets/dg_gta_512x512.py b/configs/_base_/datasets/dg_gta_512x512.py new file mode 100644 index 0000000..e53a10a --- /dev/null +++ b/configs/_base_/datasets/dg_gta_512x512.py @@ -0,0 +1,33 @@ +_base_ = [ + "./gta_512x512.py", + "./bdd100k_512x512.py", + "./cityscapes_512x512.py", + "./mapillary_512x512.py", +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type="InfiniteSampler", shuffle=True), + dataset={{_base_.train_gta}}, +) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type="ConcatDataset", + datasets=[ + {{_base_.val_cityscapes}}, + {{_base_.val_bdd}}, + {{_base_.val_mapillary}}, + ], + ), +) +test_dataloader = val_dataloader +val_evaluator = dict( + type="DGIoUMetric", iou_metrics=["mIoU"], dataset_keys=["citys", "map", 
"bdd"] +) +test_evaluator=val_evaluator diff --git a/configs/_base_/datasets/fog-acdc_1024x1024.py b/configs/_base_/datasets/fog-acdc_1024x1024.py new file mode 100644 index 0000000..bcf7134 --- /dev/null +++ b/configs/_base_/datasets/fog-acdc_1024x1024.py @@ -0,0 +1,66 @@ +fog_acdc_type = "CityscapesDataset" +fog_acdc_root = "data/acdc/" +fog_acdc_crop_size = (1024, 1024) +fog_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(1080 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=fog_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +fog_acdc_val_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +fog_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="PackSegInputs"), +] +train_fog_acdc = dict( + type=fog_acdc_type, + data_root=fog_acdc_root, + data_prefix=dict( + img_path="rgb_anon/fog/train", + seg_map_path="gt/fog/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=fog_acdc_train_pipeline, +) +val_fog_acdc = dict( + type=fog_acdc_type, + data_root=fog_acdc_root, + data_prefix=dict( + img_path="rgb_anon/fog/val", + seg_map_path="gt/fog/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=fog_acdc_val_pipeline, +) + +test_fog_acdc = dict( + type=fog_acdc_type, + data_root=fog_acdc_root, + data_prefix=dict( + img_path="rgb_anon/fog/test", + seg_map_path="gt/fog/test", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=fog_acdc_test_pipeline, +) diff --git a/configs/_base_/datasets/fog-acdc_512x512.py b/configs/_base_/datasets/fog-acdc_512x512.py new file mode 100644 index 0000000..e4a6b5d --- /dev/null +++ b/configs/_base_/datasets/fog-acdc_512x512.py @@ -0,0 +1,47 @@ +fog_acdc_type = "CityscapesDataset" +fog_acdc_root = "data/acdc/" +fog_acdc_crop_size = (512, 512) +fog_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(540 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=fog_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +fog_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(960, 540), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_fog_acdc = dict( + type=fog_acdc_type, + data_root=fog_acdc_root, + data_prefix=dict( + img_path="rgb_anon/fog/train", + seg_map_path="gt/fog/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=fog_acdc_train_pipeline, +) +val_fog_acdc = dict( + type=fog_acdc_type, + 
data_root=fog_acdc_root, + data_prefix=dict( + img_path="rgb_anon/fog/val", + seg_map_path="gt/fog/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=fog_acdc_test_pipeline, +) diff --git a/configs/_base_/datasets/gf12ms_whu_gf1.py b/configs/_base_/datasets/gf12ms_whu_gf1.py new file mode 100644 index 0000000..7d85fc0 --- /dev/null +++ b/configs/_base_/datasets/gf12ms_whu_gf1.py @@ -0,0 +1,49 @@ +dataset_type = 'GF12MSWHUGF1Dataset' +data_root = 'data/gf12ms_whu_gf1' + +crop_size = (256, 256) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=crop_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=crop_size), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', + seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/val', + seg_map_path='ann_dir/val'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=["mIoU", "mDice", "mFscore"],) +test_evaluator = val_evaluator \ No newline at end of file diff --git a/configs/_base_/datasets/gf12ms_whu_gf2.py b/configs/_base_/datasets/gf12ms_whu_gf2.py new file mode 100644 index 0000000..404b98f --- /dev/null +++ b/configs/_base_/datasets/gf12ms_whu_gf2.py @@ -0,0 +1,49 @@ +dataset_type = 'GF12MSWHUGF2Dataset' +data_root = 'data/gf12ms_whu_gf2' + +crop_size = (256, 256) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=crop_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=crop_size), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', + seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/val', + seg_map_path='ann_dir/val'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=["mIoU", "mDice", "mFscore"],) +test_evaluator = val_evaluator \ No newline at end of file diff --git a/configs/_base_/datasets/gta_512x512.py 
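The GF1/GF2 MS-WHU dataset files above (like the CloudSEN12 ones earlier) each ship complete train/val/test dataloaders and an IoU/Dice/Fscore evaluator, so a downstream experiment only needs to inherit one of them together with a model and runtime base. A hedged sketch of such a composition is shown below; the file paths and the 2-class head are assumptions, while the inheritance style mirrors the `_base_` convention used elsewhere in this repository.

```python
# Illustrative experiment config (paths and overrides are assumptions, not repo files).
_base_ = [
    "../_base_/models/cloud_adapter_dinov2.py",
    "../_base_/datasets/gf12ms_whu_gf1.py",
    "../_base_/default_runtime.py",
]
crop_size = (256, 256)  # matches the GF1/GF2 pipelines above
model = dict(
    data_preprocessor=dict(size=crop_size),
    decode_head=dict(num_classes=2),  # e.g. clear sky / cloud, as in class_names.py
)
```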
b/configs/_base_/datasets/gta_512x512.py new file mode 100644 index 0000000..ce4e405 --- /dev/null +++ b/configs/_base_/datasets/gta_512x512.py @@ -0,0 +1,43 @@ +gta_type = "CityscapesDataset" +gta_root = "data/gta/" +gta_root = "data/gta/" +gta_crop_size = (512, 512) +gta_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict(type="Resize", scale=(1280, 720)), + dict(type="RandomCrop", crop_size=gta_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +gta_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1280, 720), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_gta = dict( + type=gta_type, + data_root=gta_root, + data_prefix=dict( + img_path="images", + seg_map_path="labels", + ), + img_suffix=".png", + seg_map_suffix="_labelTrainIds.png", + pipeline=gta_train_pipeline, +) +val_gta = dict( + type=gta_type, + data_root=gta_root, + data_prefix=dict( + img_path="images", + seg_map_path="labels", + ), + img_suffix=".png", + seg_map_suffix="_labelTrainIds.png", + pipeline=gta_test_pipeline, +) \ No newline at end of file diff --git a/configs/_base_/datasets/hrc_whu.py b/configs/_base_/datasets/hrc_whu.py new file mode 100644 index 0000000..e017a63 --- /dev/null +++ b/configs/_base_/datasets/hrc_whu.py @@ -0,0 +1,49 @@ +dataset_type = 'HRCWHUDataset' +data_root = 'data/hrc_whu' + +crop_size = (256, 256) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=crop_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=crop_size), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', + seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/test', + seg_map_path='ann_dir/test'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=["mIoU", "mDice", "mFscore"],) +test_evaluator = val_evaluator \ No newline at end of file diff --git a/configs/_base_/datasets/l8_biome.py b/configs/_base_/datasets/l8_biome.py new file mode 100644 index 0000000..7b02e9a --- /dev/null +++ b/configs/_base_/datasets/l8_biome.py @@ -0,0 +1,62 @@ +dataset_type = 'L8BIOMEDataset' +data_root = 'data/l8_biome' + +crop_size = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=crop_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', 
scale=crop_size), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', + seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/test', + seg_map_path='ann_dir/test'), + pipeline=test_pipeline)) + +test_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/test', + seg_map_path='ann_dir/test'), + pipeline=test_pipeline)) +# test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=["mIoU", "mDice", "mFscore"],) +test_evaluator = val_evaluator \ No newline at end of file diff --git a/configs/_base_/datasets/mapillary_512x512.py b/configs/_base_/datasets/mapillary_512x512.py new file mode 100644 index 0000000..d44398d --- /dev/null +++ b/configs/_base_/datasets/mapillary_512x512.py @@ -0,0 +1,42 @@ +mapillary_type = "CityscapesDataset" +mapillary_root = "data/mapillary/" +mapillary_crop_size = (512, 512) +mapillary_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict(type="Resize", scale=(1024, 512)), + dict(type="RandomCrop", crop_size=mapillary_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +mapillary_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1024, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_mapillary = dict( + type=mapillary_type, + data_root=mapillary_root, + data_prefix=dict( + img_path="training/images", + seg_map_path="cityscapes_trainIdLabel/train/label", + ), + img_suffix=".jpg", + seg_map_suffix=".png", + pipeline=mapillary_train_pipeline, +) +val_mapillary = dict( + type=mapillary_type, + data_root=mapillary_root, + data_prefix=dict( + img_path="half/val_img", + seg_map_path="half/val_label", + ), + img_suffix=".jpg", + seg_map_suffix=".png", + pipeline=mapillary_test_pipeline, +) diff --git a/configs/_base_/datasets/night-acdc_1024x1024.py b/configs/_base_/datasets/night-acdc_1024x1024.py new file mode 100644 index 0000000..e9ab540 --- /dev/null +++ b/configs/_base_/datasets/night-acdc_1024x1024.py @@ -0,0 +1,65 @@ +night_acdc_type = "CityscapesDataset" +night_acdc_root = "data/acdc/" +night_acdc_crop_size = (1024, 1024) +night_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(1080 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=night_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] 
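For reference, the `RandomChoiceResize` expression used in the ACDC 1024×1024 pipelines above expands to sixteen short-edge targets between 540 and 2160 px (a quick check, not part of the config):

```python
# Short-edge candidates generated by the expression above (1080 is the ACDC short side).
scales = [int(1080 * x * 0.1) for x in range(5, 21)]
print(scales)
# [540, 648, 756, 864, 972, 1080, 1188, 1296, 1404, 1512,
#  1620, 1728, 1836, 1944, 2052, 2160]
```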
+night_acdc_val_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +night_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="PackSegInputs"), +] +train_night_acdc = dict( + type=night_acdc_type, + data_root=night_acdc_root, + data_prefix=dict( + img_path="rgb_anon/night/train", + seg_map_path="gt/night/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=night_acdc_train_pipeline, +) +val_night_acdc = dict( + type=night_acdc_type, + data_root=night_acdc_root, + data_prefix=dict( + img_path="rgb_anon/night/val", + seg_map_path="gt/night/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=night_acdc_val_pipeline, +) +test_night_acdc = dict( + type=night_acdc_type, + data_root=night_acdc_root, + data_prefix=dict( + img_path="rgb_anon/night/test", + seg_map_path="gt/night/test", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=night_acdc_test_pipeline, +) diff --git a/configs/_base_/datasets/night-acdc_512x512.py b/configs/_base_/datasets/night-acdc_512x512.py new file mode 100644 index 0000000..6fb0b48 --- /dev/null +++ b/configs/_base_/datasets/night-acdc_512x512.py @@ -0,0 +1,47 @@ +night_acdc_type = "CityscapesDataset" +night_acdc_root = "data/acdc/" +night_acdc_crop_size = (512, 512) +night_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(540 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=night_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +night_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(960, 540), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_night_acdc = dict( + type=night_acdc_type, + data_root=night_acdc_root, + data_prefix=dict( + img_path="rgb_anon/night/train", + seg_map_path="gt/night/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=night_acdc_train_pipeline, +) +val_night_acdc = dict( + type=night_acdc_type, + data_root=night_acdc_root, + data_prefix=dict( + img_path="rgb_anon/night/val", + seg_map_path="gt/night/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=night_acdc_test_pipeline, +) diff --git a/configs/_base_/datasets/rain-acdc_1024x1024.py b/configs/_base_/datasets/rain-acdc_1024x1024.py new file mode 100644 index 0000000..6ff32bb --- /dev/null +++ b/configs/_base_/datasets/rain-acdc_1024x1024.py @@ -0,0 +1,66 @@ +rain_acdc_type = "CityscapesDataset" +rain_acdc_root = "data/acdc/" +rain_acdc_crop_size = (1024, 1024) +rain_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(1080 * x * 0.1) for x 
in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=rain_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +rain_acdc_val_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +rain_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="PackSegInputs"), +] +train_rain_acdc = dict( + type=rain_acdc_type, + data_root=rain_acdc_root, + data_prefix=dict( + img_path="rgb_anon/rain/train", + seg_map_path="gt/rain/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=rain_acdc_train_pipeline, +) +val_rain_acdc = dict( + type=rain_acdc_type, + data_root=rain_acdc_root, + data_prefix=dict( + img_path="rgb_anon/rain/val", + seg_map_path="gt/rain/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=rain_acdc_val_pipeline, +) +test_rain_acdc = dict( + type=rain_acdc_type, + data_root=rain_acdc_root, + data_prefix=dict( + img_path="rgb_anon/rain/test", + seg_map_path="gt/rain/test", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=rain_acdc_test_pipeline, +) + diff --git a/configs/_base_/datasets/rain-acdc_512x512.py b/configs/_base_/datasets/rain-acdc_512x512.py new file mode 100644 index 0000000..9129106 --- /dev/null +++ b/configs/_base_/datasets/rain-acdc_512x512.py @@ -0,0 +1,47 @@ +rain_acdc_type = "CityscapesDataset" +rain_acdc_root = "data/acdc/" +rain_acdc_crop_size = (512, 512) +rain_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(540 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=rain_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +rain_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(960, 540), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_rain_acdc = dict( + type=rain_acdc_type, + data_root=rain_acdc_root, + data_prefix=dict( + img_path="rgb_anon/rain/train", + seg_map_path="gt/rain/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=rain_acdc_train_pipeline, +) +val_rain_acdc = dict( + type=rain_acdc_type, + data_root=rain_acdc_root, + data_prefix=dict( + img_path="rgb_anon/rain/val", + seg_map_path="gt/rain/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=rain_acdc_test_pipeline, +) diff --git a/configs/_base_/datasets/snow-acdc_1024x1024.py b/configs/_base_/datasets/snow-acdc_1024x1024.py new file mode 100644 index 0000000..3bb70bb --- /dev/null +++ b/configs/_base_/datasets/snow-acdc_1024x1024.py @@ -0,0 +1,65 @@ +snow_acdc_type = "CityscapesDataset" 
+snow_acdc_root = "data/acdc/" +snow_acdc_crop_size = (1024, 1024) +snow_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(1080 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=snow_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +snow_acdc_val_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +snow_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1920, 1080), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="PackSegInputs"), +] +train_snow_acdc = dict( + type=snow_acdc_type, + data_root=snow_acdc_root, + data_prefix=dict( + img_path="rgb_anon/snow/train", + seg_map_path="gt/snow/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=snow_acdc_train_pipeline, +) +val_snow_acdc = dict( + type=snow_acdc_type, + data_root=snow_acdc_root, + data_prefix=dict( + img_path="rgb_anon/snow/val", + seg_map_path="gt/snow/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=snow_acdc_val_pipeline, +) +test_snow_acdc = dict( + type=snow_acdc_type, + data_root=snow_acdc_root, + data_prefix=dict( + img_path="rgb_anon/snow/test", + seg_map_path="gt/snow/test", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=snow_acdc_test_pipeline, +) \ No newline at end of file diff --git a/configs/_base_/datasets/snow-acdc_512x512.py b/configs/_base_/datasets/snow-acdc_512x512.py new file mode 100644 index 0000000..b89dc30 --- /dev/null +++ b/configs/_base_/datasets/snow-acdc_512x512.py @@ -0,0 +1,47 @@ +snow_acdc_type = "CityscapesDataset" +snow_acdc_root = "data/acdc/" +snow_acdc_crop_size = (512, 512) +snow_acdc_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(540 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=snow_acdc_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +snow_acdc_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(960, 540), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_snow_acdc = dict( + type=snow_acdc_type, + data_root=snow_acdc_root, + data_prefix=dict( + img_path="rgb_anon/snow/train", + seg_map_path="gt/snow/train", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=snow_acdc_train_pipeline, +) +val_snow_acdc = dict( + type=snow_acdc_type, + data_root=snow_acdc_root, + data_prefix=dict( + img_path="rgb_anon/snow/val", + seg_map_path="gt/snow/val", + ), + img_suffix="_rgb_anon.png", + seg_map_suffix="_gt_labelTrainIds.png", + pipeline=snow_acdc_test_pipeline, +) diff --git 
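Once a model has been trained with any of these configs, it can be loaded for inference through the repository's `init_model` helper (cloud_adapter/utils/init_model.py). A minimal sketch with placeholder paths, assuming the standard mmseg 1.x inference API:

```python
# Minimal inference sketch. Config/checkpoint/image paths are placeholders;
# init_model comes from this repo, inference_model from mmseg 1.x (mmseg.apis).
from mmseg.apis import inference_model
from cloud_adapter.utils import init_model

model = init_model(
    "configs/my_experiment.py",                    # placeholder config path
    checkpoint="work_dirs/my_run/iter_80000.pth",  # placeholder checkpoint path
    device="cuda:0",
)
result = inference_model(model, "demo/sample.png")  # returns a SegDataSample
pred = result.pred_sem_seg.data.cpu().numpy()       # per-pixel class indices
```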
a/configs/_base_/datasets/supervsied_cityscapes_512x512.py b/configs/_base_/datasets/supervsied_cityscapes_512x512.py new file mode 100644 index 0000000..1162f08 --- /dev/null +++ b/configs/_base_/datasets/supervsied_cityscapes_512x512.py @@ -0,0 +1,59 @@ +cityscapes_type = "CityscapesDataset" +cityscapes_root = "data/cityscapes/" +cityscapes_crop_size = (512, 512) +cityscapes_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict(type="Resize", scale=(1024, 512)), + dict(type="RandomCrop", crop_size=cityscapes_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +cityscapes_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1024, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_cityscapes = dict( + type=cityscapes_type, + data_root=cityscapes_root, + data_prefix=dict( + img_path="leftImg8bit/train", + seg_map_path="gtFine/train", + ), + pipeline=cityscapes_train_pipeline, +) +train_dataloader=dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type="InfiniteSampler", shuffle=True), + dataset=train_cityscapes, +) +val_cityscapes = dict( + type=cityscapes_type, + data_root=cityscapes_root, + data_prefix=dict( + img_path="leftImg8bit/val", + seg_map_path="gtFine/val", + ), + pipeline=cityscapes_test_pipeline, +) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=val_cityscapes, +) +val_evaluator = dict( + type="IoUMetric", + iou_metrics=["mIoU"], +) +test_dataloader=val_dataloader +test_evaluator=val_evaluator diff --git a/configs/_base_/datasets/urbansyn_512x512.py b/configs/_base_/datasets/urbansyn_512x512.py new file mode 100644 index 0000000..6ee39b5 --- /dev/null +++ b/configs/_base_/datasets/urbansyn_512x512.py @@ -0,0 +1,56 @@ +urbansyn_type = "CityscapesDataset" +urbansyn_root = "data/UrbanSyn" +urbansyn_crop_size = (512, 512) +urbansyn_train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict(type="Resize", scale=(1024, 512)), + dict(type="RandomCrop", crop_size=urbansyn_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +urbansyn_train_pipeline_mask2former = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size=urbansyn_crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +urbansyn_test_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="Resize", scale=(1024, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type="LoadAnnotations"), + dict(type="PackSegInputs"), +] +train_urbansyn = dict( + type=urbansyn_type, + data_root=urbansyn_root, + data_prefix=dict( + img_path="rgb", + seg_map_path="ss", + ), + img_suffix=".png", + seg_map_suffix=".png", + pipeline=urbansyn_train_pipeline, +) +train_urbansyn_mask2former = dict( + 
type=urbansyn_type, + data_root=urbansyn_root, + data_prefix=dict( + img_path="rgb", + seg_map_path="ss", + ), + img_suffix=".png", + seg_map_suffix=".png", + pipeline=urbansyn_train_pipeline_mask2former, +) diff --git a/configs/_base_/datasets/urbansyn_synthia_gtav_to_citys.py b/configs/_base_/datasets/urbansyn_synthia_gtav_to_citys.py new file mode 100644 index 0000000..4729726 --- /dev/null +++ b/configs/_base_/datasets/urbansyn_synthia_gtav_to_citys.py @@ -0,0 +1,31 @@ +_base_ = [ + "./urbansyn_512x512.py", + "./synthia_512x512.py", + "./gta_512x512.py", + "./cityscapes_512x512.py", +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type="InfiniteSampler", shuffle=True), + dataset=dict( + type="ConcatDataset", + datasets=[ + {{_base_.train_urbansyn}}, + {{_base_.train_gta}}, + {{_base_.train_syn}}, + ], + ), +) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset={{_base_.val_cityscapes}}, +) +test_dataloader = val_dataloader +val_evaluator = dict(type="IoUMetric", iou_metrics=["mIoU"]) +test_evaluator = val_evaluator diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py new file mode 100644 index 0000000..e368340 --- /dev/null +++ b/configs/_base_/default_runtime.py @@ -0,0 +1,19 @@ +default_scope = "mmseg" +env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), + dist_cfg=dict(backend="nccl"), +) +vis_backends = [dict(type="LocalVisBackend"), dict(type="TensorboardVisBackend")] +visualizer = dict( + type="SegLocalVisualizer", vis_backends=vis_backends, name="visualizer" +) +log_processor = dict(by_epoch=False) +log_level = "INFO" +load_from = None +resume = False + +tta_model = dict(type="SegTTAModel") +randomness = dict( + seed=42, +) diff --git a/configs/_base_/models/clip-L_mask2former.py b/configs/_base_/models/clip-L_mask2former.py new file mode 100644 index 0000000..c103c4c --- /dev/null +++ b/configs/_base_/models/clip-L_mask2former.py @@ -0,0 +1,159 @@ +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, +) +checkpoint_file = "checkpoints/ViT-L-14.pt" +# load from https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + pretrained=checkpoint_file, + backbone=dict( + type="CLIPVisionTransformer", + patch_size=16, + width=1024, + output_dim=512, + get_embeddings=False, + drop_path_rate=0.1, + layers=24, + input_resolution=512, + style="pytorch", + out_indices=[7, 11, 15, 23], + heads=16, + ), + neck=dict( + type="MultiLevelNeck", + in_channels=[1024, 1024, 1024, 1024], + out_channels=1024, + scales=[4, 2, 1, 0.5], + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + 
layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) diff --git a/configs/_base_/models/cloud_adapter_dinov2.py b/configs/_base_/models/cloud_adapter_dinov2.py new file mode 100644 index 0000000..f36a3ae --- /dev/null +++ b/configs/_base_/models/cloud_adapter_dinov2.py @@ -0,0 +1,172 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="CloudAdapterDinoVisionTransformer", + + has_cat = False, + adapter_index=[0,6,12,18], + cloud_adapter_config=dict( + type="CloudAdapter", + cnn_type="pmaa", + int_type="pmaa", + emd_dim=1024, + num_layers=4, + context_dim=256, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=256, + depth=4, + local_groups=1, + global_groups=1, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + 
type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/cloud_adapter_dinov2_base.py b/configs/_base_/models/cloud_adapter_dinov2_base.py new file mode 100644 index 0000000..7dd2966 --- /dev/null +++ b/configs/_base_/models/cloud_adapter_dinov2_base.py @@ -0,0 +1,173 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="CloudAdapterDinoVisionTransformer", + + has_cat = False, + adapter_index=[0,6,12,18], + cloud_adapter_config=dict( + type="CloudAdapter", + cnn_type="pmaa", + int_type="pmaa", + emd_dim=768, + num_layers=4, + 
context_dim=256, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=256, + depth=4, + local_groups=1, + global_groups=1, + ), + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + out_indices=[2, 5, 8, 11], # refer to sam base + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_base_converted_512x512.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[768, 768, 768, 768], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/cloud_adapter_dinov2_small.py b/configs/_base_/models/cloud_adapter_dinov2_small.py new file mode 100644 index 0000000..d0121ea --- /dev/null +++ b/configs/_base_/models/cloud_adapter_dinov2_small.py @@ -0,0 
+1,173 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="CloudAdapterDinoVisionTransformer", + + has_cat = False, + adapter_index=[0,6,12,18], + cloud_adapter_config=dict( + type="CloudAdapter", + cnn_type="pmaa", + int_type="pmaa", + emd_dim=384, + num_layers=12, + context_dim=256, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=256, + depth=4, + local_groups=1, + global_groups=1, + ), + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + out_indices=[2, 5, 8, 11], # refer to sam base + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_s_converted_512x512.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[384, 384, 384, 384], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + 
type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/cloud_adapter_sam_base.py b/configs/_base_/models/cloud_adapter_sam_base.py new file mode 100644 index 0000000..59cd748 --- /dev/null +++ b/configs/_base_/models/cloud_adapter_sam_base.py @@ -0,0 +1,173 @@ +# model settings +# crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512), +) +checkpoint_file = "checkpoints/sam_b_converted_512x512.pth" +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + backbone=dict( + type="CloudAdapterSamVisionTransformer", + + has_cat = False, + adapter_index=[0,6,12,18], + cloud_adapter_config=dict( + type="CloudAdapter", + cnn_type="pmaa", + int_type="pmaa", + emd_dim=768, + num_layers=4, + context_dim=256, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=256, + depth=4, + local_groups=1, + global_groups=1, + ), + + img_size=512, + embed_dim=768, + depth=12, + num_heads=12, + global_attn_indexes=[2, 5, 8, 11], + out_indices=[2, 5, 8, 11], + window_size=14, + use_rel_pos=True, + init_cfg=dict( + type="Pretrained", + checkpoint=checkpoint_file, + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[768, 768, 768, 768], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + 
class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), # yapf: disable + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) diff --git a/configs/_base_/models/cloud_adapter_sam_huge.py b/configs/_base_/models/cloud_adapter_sam_huge.py new file mode 100644 index 0000000..c992963 --- /dev/null +++ b/configs/_base_/models/cloud_adapter_sam_huge.py @@ -0,0 +1,173 @@ +# model settings +# crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512), +) +checkpoint_file = "checkpoints/sam_h_converted_512x512.pth" +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + backbone=dict( + type="CloudAdapterSamVisionTransformer", + + has_cat = False, + adapter_index=[0,6,12,18], + cloud_adapter_config=dict( + type="CloudAdapter", + cnn_type="pmaa", + int_type="pmaa", + emd_dim=1280, + num_layers=32, + context_dim=256, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=256, + depth=4, + local_groups=1, + global_groups=1, + ), + + img_size=512, + embed_dim=1280, + depth=32, + num_heads=16, + global_attn_indexes=[7, 15, 23, 31], + out_indices=[7, 15, 23, 31], + window_size=14, + use_rel_pos=True, + init_cfg=dict( + type="Pretrained", + checkpoint=checkpoint_file, + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1280, 1280, 1280, 1280], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + 
num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), # yapf: disable + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) diff --git a/configs/_base_/models/cloud_adapter_sam_large.py b/configs/_base_/models/cloud_adapter_sam_large.py new file mode 100644 index 0000000..91a3101 --- /dev/null +++ b/configs/_base_/models/cloud_adapter_sam_large.py @@ -0,0 +1,173 @@ +# model settings +# crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512), +) +checkpoint_file = "checkpoints/sam_vit_l_converted_512x512.pth" +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + backbone=dict( + type="CloudAdapterSamVisionTransformer", + + has_cat = False, + adapter_index=[0,6,12,18], + cloud_adapter_config=dict( + type="CloudAdapter", + cnn_type="pmaa", + int_type="pmaa", + emd_dim=1024, + num_layers=12, + context_dim=256, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=256, + depth=4, + local_groups=1, + global_groups=1, + ), + + img_size=512, + embed_dim=1024, + depth=24, + num_heads=16, + global_attn_indexes=[5, 11, 17, 23], + out_indices=[5, 11, 17, 23], + window_size=14, + use_rel_pos=True, + init_cfg=dict( + type="Pretrained", + checkpoint=checkpoint_file, + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + 
feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), # yapf: disable + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) diff --git a/configs/_base_/models/cnnadapter_dinov2.py b/configs/_base_/models/cnnadapter_dinov2.py new file mode 100644 index 0000000..a3d4814 --- /dev/null +++ b/configs/_base_/models/cnnadapter_dinov2.py @@ -0,0 +1,161 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="CNNAdapterDinoVisionTransformer", + cnnadapter_config=dict( + type="CNNAdapter", + emd_dim=1024, + num_layers=24, + context_dim=128, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # 
MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/convnext-L_mask2former.py b/configs/_base_/models/convnext-L_mask2former.py new file mode 100644 index 0000000..62b3b3d --- /dev/null +++ b/configs/_base_/models/convnext-L_mask2former.py @@ -0,0 +1,156 @@ +crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +custom_imports = dict(imports="mmpretrain.models", allow_failed_imports=False) +checkpoint_file = "https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-large_3rdparty_in21k_20220301-e6e0ea0a.pth" # noqa +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=crop_size, + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, +) +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + pretrained=None, + backbone=dict( + type="mmpretrain.ConvNeXt", + arch="large", + out_indices=[0, 1, 2, 3], + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type="Pretrained", checkpoint=checkpoint_file, prefix="backbone." 
+ ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[192, 384, 768, 1536], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict( + mode="slide", + crop_size=(512, 512), + stride=(341, 341), + ), +) diff --git a/configs/_base_/models/convnext_dinov2_maskformer.py b/configs/_base_/models/convnext_dinov2_maskformer.py new file mode 100644 index 0000000..044125c --- /dev/null +++ b/configs/_base_/models/convnext_dinov2_maskformer.py @@ -0,0 +1,165 @@ +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="ConvnextDinoVisionTransformer", + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 
1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/dinov2_b_mask2former.py b/configs/_base_/models/dinov2_b_mask2former.py new file mode 100644 index 0000000..106008a --- /dev/null +++ b/configs/_base_/models/dinov2_b_mask2former.py @@ -0,0 +1,156 @@ +# crop_size = (512, 512) +num_classes = 19 # +model = 
dict( + type="EncoderDecoder", + # data_preprocessor=dict( + # type="SegDataPreProcessor", + # mean=[123.675, 116.28, 103.53], + # std=[58.395, 57.12, 57.375], + # size=(512, 512), + # bgr_to_rgb=True, + # pad_val=0, + # seg_pad_val=255, + # ), + backbone=dict( + type="DinoVisionTransformer", + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + out_indices=[2, 5, 8, 11], # refer to sam base + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[768, 768, 768, 768], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/dinov2_l_mask2former.py b/configs/_base_/models/dinov2_l_mask2former.py new 
file mode 100644 index 0000000..7c8c3c0 --- /dev/null +++ b/configs/_base_/models/dinov2_l_mask2former.py @@ -0,0 +1,155 @@ +# crop_size = (512, 512) +num_classes = 19 # +model = dict( + type="EncoderDecoder", + # data_preprocessor=dict( + # type="SegDataPreProcessor", + # mean=[123.675, 116.28, 103.53], + # std=[58.395, 57.12, 57.375], + # size=(512, 512), + # bgr_to_rgb=True, + # pad_val=0, + # seg_pad_val=255, + # ), + backbone=dict( + type="DinoVisionTransformer", + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + 
test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/dinov2_s_mask2former.py b/configs/_base_/models/dinov2_s_mask2former.py new file mode 100644 index 0000000..25b92cd --- /dev/null +++ b/configs/_base_/models/dinov2_s_mask2former.py @@ -0,0 +1,156 @@ +# crop_size = (512, 512) +num_classes = 19 # +model = dict( + type="EncoderDecoder", + # data_preprocessor=dict( + # type="SegDataPreProcessor", + # mean=[123.675, 116.28, 103.53], + # std=[58.395, 57.12, 57.375], + # size=(512, 512), + # bgr_to_rgb=True, + # pad_val=0, + # seg_pad_val=255, + # ), + backbone=dict( + type="DinoVisionTransformer", + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + out_indices=[2, 5, 8, 11], # refer to sam base + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[384, 384, 384, 384], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + 
dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/eva02-L_mask2former.py b/configs/_base_/models/eva02-L_mask2former.py new file mode 100644 index 0000000..84d8d93 --- /dev/null +++ b/configs/_base_/models/eva02-L_mask2former.py @@ -0,0 +1,169 @@ +crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +backbone_norm_cfg = dict(type="LN", requires_grad=True, eps=1e-6) +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=crop_size, + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="EVA2", + img_size=512, + patch_size=16, + in_chans=3, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4 * 2 / 3, # GLU default + out_indices=[7, 11, 15, 23], + qkv_bias=True, + drop_path_rate=0.2, + init_values=None, + use_checkpoint=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + rope=True, + pt_hw_seq_len=16, + intp_freq=True, + subln=True, + xattn=True, + naiveswiglu=True, + pretrained="checkpoints/eva02_L_converted.pth", + norm_layer=backbone_norm_cfg, + ), + decode_head=dict( + type="ReinMask2FormerHead", + replace_query_feat=True, + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + 
type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict( + mode="slide", + crop_size=(512, 512), + stride=(341, 341), + ), +) diff --git a/configs/_base_/models/loracacheadapter_dinov2_mask2former.py b/configs/_base_/models/loracacheadapter_dinov2_mask2former.py new file mode 100644 index 0000000..8585e38 --- /dev/null +++ b/configs/_base_/models/loracacheadapter_dinov2_mask2former.py @@ -0,0 +1,162 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="LoRACacheAdapterDinoVisionTransformer", + loracacheadapter_config=dict( + type="LoRACacheAdapter", + emd_dim=1024, + num_layers=24, + rank_dim=8, + cache_dim=16, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + 
add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/my_rein_dinov2_mask2former.py b/configs/_base_/models/my_rein_dinov2_mask2former.py new file mode 100644 index 0000000..d0ba133 --- /dev/null +++ b/configs/_base_/models/my_rein_dinov2_mask2former.py @@ -0,0 +1,163 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="MyReinsDinoVisionTransformer", + reins_config=dict( + type="MyReins", + token_length=100, + embed_dims=1024, + num_layers=24, + patch_size=16, + link_token_to_query=True, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # 
MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/my_rein_token_mlp_dinov2_mask2former.py b/configs/_base_/models/my_rein_token_mlp_dinov2_mask2former.py new file mode 100644 index 0000000..ca87c83 --- /dev/null +++ b/configs/_base_/models/my_rein_token_mlp_dinov2_mask2former.py @@ -0,0 +1,163 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="MyReinTokenDinoVisionTransformer", + reins_config=dict( + type="MyReinsTokenMlp", + token_length=50, + embed_dims=1024, + num_layers=24, + patch_size=16, + link_token_to_query=True, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + 
transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/pmaa_adapter_dinov2.py b/configs/_base_/models/pmaa_adapter_dinov2.py new file mode 100644 index 0000000..6c4a88e --- /dev/null +++ b/configs/_base_/models/pmaa_adapter_dinov2.py @@ -0,0 +1,161 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="PMAAAdapterDinoVisionTransformer", + pmaa_adapter_config=dict( + type="PMAAAdapter", + emd_dim=1024, + num_layers=24, + context_dim=256 + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + 
positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/rein_dinov2_mask2former.py b/configs/_base_/models/rein_dinov2_mask2former.py new file mode 100644 index 0000000..1d7b54b --- /dev/null +++ b/configs/_base_/models/rein_dinov2_mask2former.py @@ -0,0 +1,169 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="ReinsDinoVisionTransformer", + reins_config=dict( + type="LoRAReins", + token_length=100, + embed_dims=1024, + num_layers=24, + patch_size=16, + link_token_to_query=True, + lora_dim=16, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="ReinMask2FormerHead", + replace_query_feat=True, + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + 
embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict( + mode="slide", + crop_size=(512, 512), + stride=(341, 341), + ), +) diff --git a/configs/_base_/models/rein_eva02-L_mask2former.py b/configs/_base_/models/rein_eva02-L_mask2former.py new file mode 100644 index 0000000..f64ff83 --- /dev/null +++ b/configs/_base_/models/rein_eva02-L_mask2former.py @@ -0,0 +1,178 @@ +crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +backbone_norm_cfg = dict(type="LN", requires_grad=True, eps=1e-6) +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=crop_size, + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="ReinsEVA2", + reins_config=dict( + type="LoRAReins", + token_length=100, + embed_dims=1024, + num_layers=24, + patch_size=16, + link_token_to_query=True, + lora_dim=16, + ), + img_size=512, + patch_size=16, + in_chans=3, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4 * 2 / 3, # GLU default + out_indices=[7, 11, 15, 23], + qkv_bias=True, + drop_path_rate=0.2, + init_values=None, + use_checkpoint=False, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + rope=True, + pt_hw_seq_len=16, + intp_freq=True, + subln=True, + xattn=True, + naiveswiglu=True, + 
pretrained="checkpoints/eva02_L_converted.pth", + norm_layer=backbone_norm_cfg, + ), + decode_head=dict( + type="ReinMask2FormerHead", + replace_query_feat=True, + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict( + mode="slide", + crop_size=(512, 512), + stride=(341, 341), + ), +) diff --git a/configs/_base_/models/rein_resnet50_mask2former.py b/configs/_base_/models/rein_resnet50_mask2former.py new file mode 100644 index 0000000..b6b89c6 --- /dev/null +++ b/configs/_base_/models/rein_resnet50_mask2former.py @@ -0,0 +1,182 @@ +crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=crop_size, + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="ReinsResNetV1c", + reins_config=dict( + 
type="LoRAReins", + token_length=100, + patch_size=16, + link_token_to_query=True, + lora_dim=16, + ), + distinct_cfgs=( + dict( + num_layers=3, + embed_dims=256, + ), + dict( + num_layers=4, + embed_dims=512, + ), + dict( + num_layers=6, + embed_dims=1024, + ), + dict( + num_layers=3, + embed_dims=2048, + ), + ), + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=dict(type="SyncBN", requires_grad=True), + norm_eval=False, + style="pytorch", + contract_dilation=True, + init_cfg=dict( + type="Pretrained", + checkpoint="open-mmlab://resnet50_v1c", + ), + ), + decode_head=dict( + type="ReinMask2FormerHead", + replace_query_feat=True, + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict( + mode="slide", + crop_size=(512, 512), + stride=(341, 341), + ), +) diff --git 
a/configs/_base_/models/rein_tokens_dinov2_mask2former.py b/configs/_base_/models/rein_tokens_dinov2_mask2former.py new file mode 100644 index 0000000..f34580c --- /dev/null +++ b/configs/_base_/models/rein_tokens_dinov2_mask2former.py @@ -0,0 +1,163 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="ReinsTokenDinoVisionTransformer", + reins_config=dict( + type="MyReinsToken", + token_length=50, + embed_dims=1024, + num_layers=24, + patch_size=16, + link_token_to_query=True, + ), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + 
type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/models/sam-vit-b_mask2former.py b/configs/_base_/models/sam-vit-b_mask2former.py new file mode 100644 index 0000000..c84f40e --- /dev/null +++ b/configs/_base_/models/sam-vit-b_mask2former.py @@ -0,0 +1,155 @@ +# model settings +# crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512), +) +checkpoint_file = "checkpoints/sam_vit_b_converted_512x512.pth" +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + backbone=dict( + type="SAMViT", + img_size=512, + embed_dim=768, + depth=12, + num_heads=12, + global_attn_indexes=[2, 5, 8, 11], + out_indices=[2, 5, 8, 11], + window_size=14, + use_rel_pos=True, + init_cfg=dict( + type="Pretrained", + checkpoint=checkpoint_file, + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[768, 768, 768, 768], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + 
num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), # yapf: disable + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) diff --git a/configs/_base_/models/sam-vit-h_mask2former.py b/configs/_base_/models/sam-vit-h_mask2former.py new file mode 100644 index 0000000..585bea5 --- /dev/null +++ b/configs/_base_/models/sam-vit-h_mask2former.py @@ -0,0 +1,155 @@ +# model settings +# crop_size = (512, 512) +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512), +) +checkpoint_file = "checkpoints/sam_vit_h_converted_512x512.pth" +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + backbone=dict( + type="SAMViT", + img_size=512, + embed_dim=1280, + depth=32, + num_heads=16, + global_attn_indexes=[7, 15, 23, 31], + out_indices=[7, 15, 23, 31], + window_size=14, + use_rel_pos=True, + init_cfg=dict( + type="Pretrained", + checkpoint=checkpoint_file, + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1280, 1280, 1280, 1280], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + 
reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), # yapf: disable + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) diff --git a/configs/_base_/models/sam-vit-l_mask2former.py b/configs/_base_/models/sam-vit-l_mask2former.py new file mode 100644 index 0000000..542c63a --- /dev/null +++ b/configs/_base_/models/sam-vit-l_mask2former.py @@ -0,0 +1,154 @@ +# model settings +num_classes = 19 +norm_cfg = dict(type="SyncBN", requires_grad=True) +data_preprocessor = dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512), +) +checkpoint_file = "checkpoints/sam_vit_l_converted_512x512.pth" +model = dict( + type="EncoderDecoder", + data_preprocessor=data_preprocessor, + backbone=dict( + type="SAMViT", + img_size=512, + embed_dim=1024, + depth=24, + num_heads=16, + global_attn_indexes=[5, 11, 17, 23], + out_indices=[5, 11, 17, 23], + window_size=14, + use_rel_pos=True, + init_cfg=dict( + type="Pretrained", + checkpoint=checkpoint_file, + ), + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + 
type="mmdet.CrossEntropyLoss", + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), # yapf: disable + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) diff --git a/configs/_base_/models/vit_adapter_dinov2.py b/configs/_base_/models/vit_adapter_dinov2.py new file mode 100644 index 0000000..5ffac18 --- /dev/null +++ b/configs/_base_/models/vit_adapter_dinov2.py @@ -0,0 +1,158 @@ +# crop_size = (512, 512) +num_classes = 19 +model = dict( + type="EncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + type="ViTAdapter", + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + img_size=512, + ffn_layer="mlp", + init_values=1e-05, + block_chunks=0, + qkv_bias=True, + proj_bias=True, + ffn_bias=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted.pth", + ), + + interaction_indexes=[[0, 5], [6, 11], [12, 17], [18, 23]], + deform_num_heads=16, + ), + decode_head=dict( + type="Mask2FormerHead", + in_channels=[1024, 1024, 1024, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type="mmdet.MSDeformAttnPixelDecoder", + num_outs=3, + norm_cfg=dict(type="GN", num_groups=32), + act_cfg=dict(type="ReLU"), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None, + ), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type="ReLU", inplace=True), + ), + ), + init_cfg=None, + ), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + init_cfg=None, + ), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True + ), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True, + ), + ffn_cfg=dict( + embed_dims=256, + 
feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type="ReLU", inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True, + ), + ), + init_cfg=None, + ), + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + loss_mask=dict( + type="mmdet.CrossEntropyLoss", + use_sigmoid=True, + reduction="mean", + loss_weight=5.0, + ), + loss_dice=dict( + type="mmdet.DiceLoss", + use_sigmoid=True, + activate=True, + reduction="mean", + naive_dice=True, + eps=1.0, + loss_weight=5.0, + ), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type="mmdet.HungarianAssigner", + match_costs=[ + dict(type="mmdet.ClassificationCost", weight=2.0), + dict( + type="mmdet.CrossEntropyLossCost", weight=5.0, use_sigmoid=True + ), + dict(type="mmdet.DiceCost", weight=5.0, pred_act=True, eps=1.0), + ], + ), + sampler=dict(type="mmdet.MaskPseudoSampler"), + ), + ), + # model training and testing setting + train_cfg=dict(), + test_cfg=dict(mode='whole') +) diff --git a/configs/_base_/schedules/schedule_40k.py b/configs/_base_/schedules/schedule_40k.py new file mode 100644 index 0000000..8c2721a --- /dev/null +++ b/configs/_base_/schedules/schedule_40k.py @@ -0,0 +1,26 @@ +param_scheduler = [ + dict(type="LinearLR", start_factor=1e-6, by_epoch=False, begin=0, end=1000), + dict( + type="PolyLR", + eta_min=0.0, + power=0.9, + begin=1000, + end=40000, + by_epoch=False, + ), +] + +# training schedule for 40k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=8000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_adapter_all.py b/configs/adapter/cloud_adapter_adapter_all.py new file mode 100644 index 0000000..2d3b9c7 --- /dev/null +++ b/configs/adapter/cloud_adapter_adapter_all.py @@ -0,0 +1,87 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + num_layers=24, + attention_scale=1 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 
0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_cnn_convnext.py b/configs/adapter/cloud_adapter_cnn_convnext.py new file mode 100644 index 0000000..dd759e2 --- /dev/null +++ b/configs/adapter/cloud_adapter_cnn_convnext.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="convnext" + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_convnext_convnext_adapter_index_all.py 
b/configs/adapter/cloud_adapter_convnext_convnext_adapter_index_all.py new file mode 100644 index 0000000..6d12abd --- /dev/null +++ b/configs/adapter/cloud_adapter_convnext_convnext_adapter_index_all.py @@ -0,0 +1,96 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="convnext", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_convnext_convnext_base.py b/configs/adapter/cloud_adapter_convnext_convnext_base.py new file mode 100644 index 0000000..e9e1a7e --- /dev/null +++ b/configs/adapter/cloud_adapter_convnext_convnext_base.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="convnext", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + 
local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_convnext_convnext_cat.py b/configs/adapter/cloud_adapter_convnext_convnext_cat.py new file mode 100644 index 0000000..f4bdf2c --- /dev/null +++ b/configs/adapter/cloud_adapter_convnext_convnext_cat.py @@ -0,0 +1,98 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + has_cat=True, + cloud_adapter_config=dict( + cnn_type="convnext", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + in_channels=[1024+64, 1024+64, 1024+64, 1024+64], + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + 
"learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, + begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_convnext_convnext_dim_128.py b/configs/adapter/cloud_adapter_convnext_convnext_dim_128.py new file mode 100644 index 0000000..87ee453 --- /dev/null +++ b/configs/adapter/cloud_adapter_convnext_convnext_dim_128.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="convnext", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=128, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=128, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) 
diff --git a/configs/adapter/cloud_adapter_convnext_convnext_last_feature.py b/configs/adapter/cloud_adapter_convnext_convnext_last_feature.py new file mode 100644 index 0000000..b674e18 --- /dev/null +++ b/configs/adapter/cloud_adapter_convnext_convnext_last_feature.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="convnext", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=False, + return_last_feature=True, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_convnext_convnext_lora_16_adapter_all.py b/configs/adapter/cloud_adapter_convnext_convnext_lora_16_adapter_all.py new file mode 100644 index 0000000..aeafceb --- /dev/null +++ b/configs/adapter/cloud_adapter_convnext_convnext_lora_16_adapter_all.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="convnext", + int_type="convnext", + emd_dim=1024, + num_layers=24, + 
context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_convnext_convnext_multi_feature.py b/configs/adapter/cloud_adapter_convnext_convnext_multi_feature.py new file mode 100644 index 0000000..ad3e26c --- /dev/null +++ b/configs/adapter/cloud_adapter_convnext_convnext_multi_feature.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="convnext", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=True, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": 
dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_dim_64.py b/configs/adapter/cloud_adapter_dim_64.py new file mode 100644 index 0000000..fbb2857 --- /dev/null +++ b/configs/adapter/cloud_adapter_dim_64.py @@ -0,0 +1,86 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + context_dim=64, + hidden_channels=64 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_int_convnext.py b/configs/adapter/cloud_adapter_int_convnext.py new file mode 100644 index 0000000..7d2b315 --- 
/dev/null +++ b/configs/adapter/cloud_adapter_int_convnext.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + int_type="convnext" + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_l1c.py b/configs/adapter/cloud_adapter_l1c.py new file mode 100644 index 0000000..394f182 --- /dev/null +++ b/configs/adapter/cloud_adapter_l1c.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict(), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, 
eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_multi_feature.py b/configs/adapter/cloud_adapter_multi_feature.py new file mode 100644 index 0000000..4eec31e --- /dev/null +++ b/configs/adapter/cloud_adapter_multi_feature.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + return_multi_feats=True + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext.py 
b/configs/adapter/cloud_adapter_pmaa_convnext.py new file mode 100644 index 0000000..0a4e16a --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_dim_16.py b/configs/adapter/cloud_adapter_pmaa_convnext_dim_16.py new file mode 100644 index 0000000..cbfe6b2 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_dim_16.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=16, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=16, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + 
checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_dim_16_[7,11,15,23].py b/configs/adapter/cloud_adapter_pmaa_convnext_dim_16_[7,11,15,23].py new file mode 100644 index 0000000..efdf555 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_dim_16_[7,11,15,23].py @@ -0,0 +1,348 @@ +crop_size = ( + 512, + 512, +) +data_root = 'data/cloudsen12_high_l1c' +dataset_type = 'CLOUDSEN12HIGHL1CDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 7, + 11, + 15, + 23, + ], + block_chunks=0, + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_512x512.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=16, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=16, + int_type='convnext', + local_groups=1, + num_layers=4, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + 
bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 4 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + 
dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_dim_16_[7,11,15,23]' diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_dim_32.py b/configs/adapter/cloud_adapter_pmaa_convnext_dim_32.py new file mode 100644 index 0000000..d6c2fbb --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_dim_32.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=32, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=32, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + 
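# class_weight below expands to [1.0, 1.0, 1.0, 1.0, 0.1] for num_classes=4; the trailing 0.1 presumably down-weights Mask2Former's extra "no-object" class +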
use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_dim_64_adapter_all.py b/configs/adapter/cloud_adapter_pmaa_convnext_dim_64_adapter_all.py new file mode 100644 index 0000000..74f2ad2 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_dim_64_adapter_all.py @@ -0,0 +1,96 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = 
dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16.py new file mode 100644 index 0000000..383bebb --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16.py @@ -0,0 +1,96 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_12.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_12.py new file mode 100644 index 0000000..49c866e --- /dev/null +++ 
b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_12.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24) if i % 2 == 0], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=12, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_8.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_8.py new file mode 100644 index 0000000..a6147ad --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_8.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24) if i % 3 == 0], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=8, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + 
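# rank_dim below appears to set the low-rank (LoRA-style) bottleneck dimension of the adapter, matching the "lora_16" suffix in this config's file name +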
rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py new file mode 100644 index 0000000..d280f39 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + 
"learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_128.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_128.py new file mode 100644 index 0000000..c6331af --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_128.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=128, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=128, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + 
sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_16.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_16.py new file mode 100644 index 0000000..ebd02ab --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_16.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=16, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=16, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_32.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_32.py new file mode 100644 index 0000000..87e66fd --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dim_32.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + 
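# mean/std above are the standard ImageNet RGB statistics; bgr_to_rgb=True converts images loaded in BGR channel order before normalization +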
), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=32, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=32, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dinov2_base.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dinov2_base.py new file mode 100644 index 0000000..454ae62 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dinov2_base.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2_base.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(12)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=768, + num_layers=12, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_b_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in 
backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dinov2_small.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dinov2_small.py new file mode 100644 index 0000000..b900107 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_dinov2_small.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2_small.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(12)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=384, + num_layers=12, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_s_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = 
dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1.py new file mode 100644 index 0000000..6dacc6b --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf1.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=80000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=3, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1_load_head.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1_load_head.py new file mode 100644 index 0000000..edd8d8e --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1_load_head.py @@ -0,0 
+1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf1.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf2.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf2.py new file mode 100644 index 0000000..a110ef7 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf2.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + 
checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_has_cat.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_has_cat.py new file mode 100644 index 0000000..9bbbb0d --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_has_cat.py @@ -0,0 +1,99 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + has_cat=True, + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + in_channels=[1088,1088,1088,1088], + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": 
embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu.py new file mode 100644 index 0000000..6d0cf37 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/hrc_whu.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + 
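# hooks below: DistSamplerSeedHook keeps the distributed sampler's shuffling seed in sync across workers; SegVisualizationHook can write qualitative segmentation results during validation +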
sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l2a.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l2a.py new file mode 100644 index 0000000..a3ed339 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l2a.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l2a.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8.py new file mode 100644 index 0000000..1827778 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + 
img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=400000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head.py new file mode 100644 index 0000000..d39ae08 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = 
dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w.py new file mode 100644 index 0000000..1827778 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=400000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = 
dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_last_feature.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_last_feature.py new file mode 100644 index 0000000..64afaa9 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_last_feature.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=True, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_multi_feature.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_multi_feature.py new file mode 100644 index 0000000..b95360a --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_multi_feature.py @@ -0,0 +1,97 
@@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=True, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting.py new file mode 100644 index 0000000..abff764 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="no_adapting", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + 
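+            # DINOv2 checkpoint converted for 512x512 inputs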
checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting_no_block.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting_no_block.py new file mode 100644 index 0000000..3eb9be5 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting_no_block.py @@ -0,0 +1,98 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="no_adapting", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16, + has_block=False + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": 
embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting_no_block_no_stem.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting_no_block_no_stem.py new file mode 100644 index 0000000..9587597 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_no_adapting_no_block_no_stem.py @@ -0,0 +1,99 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="no_adapting", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16, + has_block=False, + has_stem=False, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + 
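+        # save every 4000 iterations but keep only the single best-mIoU checkpoint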
type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_base.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_base.py new file mode 100644 index 0000000..9114366 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_base.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_sam_base.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(12)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=768, + num_layers=12, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_b_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_huge.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_huge.py new file mode 100644 index 0000000..945750f --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_huge.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_sam_huge.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + 
type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(32)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1280, + num_layers=32, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_h_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_large.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_large.py new file mode 100644 index 0000000..8db22f3 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_16_adapter_all_sam_large.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_sam_large.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + 
class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_32_adapter_all.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_32_adapter_all.py new file mode 100644 index 0000000..ad7934d --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_32_adapter_all.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=32 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, 
val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_4_adapter_all.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_4_adapter_all.py new file mode 100644 index 0000000..455ed8b --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_lora_4_adapter_all.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=4 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_lora_8_adapter_all.py b/configs/adapter/cloud_adapter_pmaa_convnext_lora_8_adapter_all.py new file mode 100644 index 0000000..20cd9a2 --- /dev/null +++ 
b/configs/adapter/cloud_adapter_pmaa_convnext_lora_8_adapter_all.py @@ -0,0 +1,97 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + adapter_index=[i for i in range(24)], + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=24, + context_dim=64, + return_multi_feats=False, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + rank_dim=8 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_convnext_multi_feature.py b/configs/adapter/cloud_adapter_pmaa_convnext_multi_feature.py new file mode 100644 index 0000000..8b3b116 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_convnext_multi_feature.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="convnext", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=True, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + 
checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cloud_adapter_pmaa_pmaa_multi_feature.py b/configs/adapter/cloud_adapter_pmaa_pmaa_multi_feature.py new file mode 100644 index 0000000..6260ce2 --- /dev/null +++ b/configs/adapter/cloud_adapter_pmaa_pmaa_multi_feature.py @@ -0,0 +1,95 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cloud_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cloud_adapter_config=dict( + cnn_type="pmaa", + int_type="pmaa", + emd_dim=1024, + num_layers=4, + context_dim=64, + return_multi_feats=True, + return_last_feature=False, + hidden_channels=64, + depth=4, + local_groups=1, + global_groups=1, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, 
power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cnnadapter_dinov2_has_cat_l1c.py b/configs/adapter/cnnadapter_dinov2_has_cat_l1c.py new file mode 100644 index 0000000..e4e0e9e --- /dev/null +++ b/configs/adapter/cnnadapter_dinov2_has_cat_l1c.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cnnadapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + has_cat=True, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + in_channels=[1024 + 128, 1024 + 128, 1024 + 128, 1024 + 128], + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cnnadapter_dinov2_l1c.py b/configs/adapter/cnnadapter_dinov2_l1c.py new file mode 100644 index 0000000..a21b9e4 --- /dev/null +++ b/configs/adapter/cnnadapter_dinov2_l1c.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + 
"../_base_/models/cnnadapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cnnadapter_config=dict(), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cnnadapter_dinov2_l1c_cross_4.py b/configs/adapter/cnnadapter_dinov2_l1c_cross_4.py new file mode 100644 index 0000000..6ebd84d --- /dev/null +++ b/configs/adapter/cnnadapter_dinov2_l1c_cross_4.py @@ -0,0 +1,86 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cnnadapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + cross_attention_count=4, + cnnadapter_config=dict( + num_layers=4 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": 
embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/cnnadapter_dinov2_l1c_cross_4_has_cat.py b/configs/adapter/cnnadapter_dinov2_l1c_cross_4_has_cat.py new file mode 100644 index 0000000..28db933 --- /dev/null +++ b/configs/adapter/cnnadapter_dinov2_l1c_cross_4_has_cat.py @@ -0,0 +1,88 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/cnnadapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + has_cat=True, + cross_attention_count=4, + cnnadapter_config=dict( + num_layers=4 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + in_channels=[1024 + 128, 1024 + 128, 1024 + 128, 1024 + 128], + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_dinov2_froze_head_l_maskformer_gf2.py 
b/configs/adapter/convnext_dinov2_froze_head_l_maskformer_gf2.py new file mode 100644 index 0000000..ad52ecb --- /dev/null +++ b/configs/adapter/convnext_dinov2_froze_head_l_maskformer_gf2.py @@ -0,0 +1,87 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + type="FrozenHeadEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_l_gf2_head.pth", + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_dinov2_froze_l_maskformer_gf2.py b/configs/adapter/convnext_dinov2_froze_l_maskformer_gf2.py new file mode 100644 index 0000000..3d3c8ad --- /dev/null +++ b/configs/adapter/convnext_dinov2_froze_l_maskformer_gf2.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay 
for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_high_gf2.py b/configs/adapter/convnext_high_gf2.py new file mode 100644 index 0000000..0d75acb --- /dev/null +++ b/configs/adapter/convnext_high_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="high", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + 
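+    # standard mmengine hooks: iteration timer, logging every 4000 iterations, LR scheduling, best-mIoU checkpointing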
timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_high_l1c.py b/configs/adapter/convnext_high_l1c.py new file mode 100644 index 0000000..d449bf4 --- /dev/null +++ b/configs/adapter/convnext_high_l1c.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="high", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_high_rank_1_gf2.py b/configs/adapter/convnext_high_rank_1_gf2.py new file mode 100644 index 0000000..cf51f42 --- /dev/null +++ b/configs/adapter/convnext_high_rank_1_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 
103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="high", # low or high + rank_scale=1, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_high_rank_2_gf2.py b/configs/adapter/convnext_high_rank_2_gf2.py new file mode 100644 index 0000000..14a0da1 --- /dev/null +++ b/configs/adapter/convnext_high_rank_2_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="high", # low or high + rank_scale=2, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in 
backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_high_rank_8_gf2.py b/configs/adapter/convnext_high_rank_8_gf2.py new file mode 100644 index 0000000..a4da157 --- /dev/null +++ b/configs/adapter/convnext_high_rank_8_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="high", # low or high + rank_scale=8, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + 
logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_low_gf2.py b/configs/adapter/convnext_low_gf2.py new file mode 100644 index 0000000..119ebf1 --- /dev/null +++ b/configs/adapter/convnext_low_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_low_l1c.py b/configs/adapter/convnext_low_l1c.py new file mode 100644 index 0000000..6178754 --- /dev/null +++ b/configs/adapter/convnext_low_l1c.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + 
bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_low_rank_1_gf2.py b/configs/adapter/convnext_low_rank_1_gf2.py new file mode 100644 index 0000000..fac0655 --- /dev/null +++ b/configs/adapter/convnext_low_rank_1_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=1, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) 
+optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_low_rank_8_gf2.py b/configs/adapter/convnext_low_rank_8_gf2.py new file mode 100644 index 0000000..4da9d90 --- /dev/null +++ b/configs/adapter/convnext_low_rank_8_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=8, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + 
param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_no_conv_gf2.py b/configs/adapter/convnext_no_conv_gf2.py new file mode 100644 index 0000000..9d9bf34 --- /dev/null +++ b/configs/adapter/convnext_no_conv_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = False, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_no_conv_l1c.py b/configs/adapter/convnext_no_conv_l1c.py new file mode 100644 index 0000000..6e53bca --- /dev/null +++ b/configs/adapter/convnext_no_conv_l1c.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), 
+ backbone=dict( + img_size=512, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = False, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_no_prog_gf2.py b/configs/adapter/convnext_no_prog_gf2.py new file mode 100644 index 0000000..63a34ca --- /dev/null +++ b/configs/adapter/convnext_no_prog_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = False, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + 
optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_no_prog_l1c.py b/configs/adapter/convnext_no_prog_l1c.py new file mode 100644 index 0000000..42a6024 --- /dev/null +++ b/configs/adapter/convnext_no_prog_l1c.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=4, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = False, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + 
type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_rank_2_gf2.py b/configs/adapter/convnext_rank_2_gf2.py new file mode 100644 index 0000000..ad948eb --- /dev/null +++ b/configs/adapter/convnext_rank_2_gf2.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + convnext_config=dict( + type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=2, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/convnext_rank_2_l1c.py b/configs/adapter/convnext_rank_2_l1c.py new file mode 100644 index 0000000..e3753fe --- /dev/null +++ b/configs/adapter/convnext_rank_2_l1c.py @@ -0,0 +1,93 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + convnext_config=dict( + 
type="AdapterConvNeXtBlock", + embed_dim=1024, + rank_type="low", # low or high + rank_scale=2, # 1, 2, 4, 8 + alpha = 1, # 1, 2, 4, 8 or nn.Parameter(data=torch.ones(embed_dim)) + act_layer = "silu", # nn.GELU or nn.SiLU + has_conv = True, + has_proj = True, + drop_prob=0, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/loracacheadapter_dinov2_l1c.py b/configs/adapter/loracacheadapter_dinov2_l1c.py new file mode 100644 index 0000000..3926a89 --- /dev/null +++ b/configs/adapter/loracacheadapter_dinov2_l1c.py @@ -0,0 +1,88 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/loracacheadapter_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + loracacheadapter_config=dict( + emd_dim=1024, + num_layers=24, + rank_dim=16, + cache_dim=256, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + 
"learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/loracacheadapter_dinov2_no_cnn_l1c.py b/configs/adapter/loracacheadapter_dinov2_no_cnn_l1c.py new file mode 100644 index 0000000..d124db5 --- /dev/null +++ b/configs/adapter/loracacheadapter_dinov2_no_cnn_l1c.py @@ -0,0 +1,91 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/loracacheadapter_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + loracacheadapter_config=dict( + emd_dim=1024, + num_layers=24, + rank_dim=16, + cache_dim=256, + + ####### + has_cnn=False + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein__token_mlp_scale8.py b/configs/adapter/my_rein__token_mlp_scale8.py new file mode 100644 index 
0000000..ecec441 --- /dev/null +++ b/configs/adapter/my_rein__token_mlp_scale8.py @@ -0,0 +1,86 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_token_mlp_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + token_length=50, + mlp_scale=8 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_high_high.py b/configs/adapter/my_rein_dinov2_high_high.py new file mode 100644 index 0000000..83aff2d --- /dev/null +++ b/configs/adapter/my_rein_dinov2_high_high.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + high_high=True + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + 
constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_l1c.py b/configs/adapter/my_rein_dinov2_l1c.py new file mode 100644 index 0000000..837a991 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_l1c.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git 
a/configs/adapter/my_rein_dinov2_low_low.py b/configs/adapter/my_rein_dinov2_low_low.py new file mode 100644 index 0000000..e0101d3 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_low_low.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + low_low=True + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_low_low_mlp16.py b/configs/adapter/my_rein_dinov2_low_low_mlp16.py new file mode 100644 index 0000000..7cbbb14 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_low_low_mlp16.py @@ -0,0 +1,87 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + low_low=True, + mlp_scale=16 + + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in 
backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_low_low_mlp8.py b/configs/adapter/my_rein_dinov2_low_low_mlp8.py new file mode 100644 index 0000000..ccf6210 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_low_low_mlp8.py @@ -0,0 +1,87 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + low_low=True, + mlp_scale=8 + + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + 
rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_mlp__mlp_scale_8_conv.py b/configs/adapter/my_rein_dinov2_mlp__mlp_scale_8_conv.py new file mode 100644 index 0000000..4c05626 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_mlp__mlp_scale_8_conv.py @@ -0,0 +1,86 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + is_conv=True, + mlp_scale=8 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_mlp_conv.py b/configs/adapter/my_rein_dinov2_mlp_conv.py new file mode 100644 index 0000000..0333bea --- /dev/null +++ b/configs/adapter/my_rein_dinov2_mlp_conv.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + is_conv=True + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + 
class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_mlp_scale_16.py b/configs/adapter/my_rein_dinov2_mlp_scale_16.py new file mode 100644 index 0000000..d54f867 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_mlp_scale_16.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + mlp_scale=16 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + 
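+ # note: LoggerHook interval=4000 prints a training-log line only once per validation
+ # round, much sparser than the usual interval=50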
param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_mlp_scale_8.py b/configs/adapter/my_rein_dinov2_mlp_scale_8.py new file mode 100644 index 0000000..6ddad1d --- /dev/null +++ b/configs/adapter/my_rein_dinov2_mlp_scale_8.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + mlp_scale=8 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_no_depend.py b/configs/adapter/my_rein_dinov2_no_depend.py new file mode 100644 index 0000000..bd3e146 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_no_depend.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + is_depend=False + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + 
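+ # num_classes=4 matches the CloudSEN12 high-quality label set
+ # (clear, thick cloud, thin cloud, cloud shadow)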
loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_no_shared.py b/configs/adapter/my_rein_dinov2_no_shared.py new file mode 100644 index 0000000..4ef5b59 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_no_shared.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + is_share=False + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + 
timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_dinov2_silu.py b/configs/adapter/my_rein_dinov2_silu.py new file mode 100644 index 0000000..c704273 --- /dev/null +++ b/configs/adapter/my_rein_dinov2_silu.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + activate="silu" + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_rank_16.py b/configs/adapter/my_rein_rank_16.py new file mode 100644 index 0000000..28f4773 --- /dev/null +++ b/configs/adapter/my_rein_rank_16.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_tokens_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + rank=16, + ), + init_cfg=dict( + type="Pretrained", + 
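+ # the dinov2_converted_512x512.pth checkpoint below is presumably the official DINOv2-L
+ # weights with patch/position embeddings resized for 512x512 inputs by the repo's
+ # conversion script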
checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_rank_32.py b/configs/adapter/my_rein_rank_32.py new file mode 100644 index 0000000..ca671ae --- /dev/null +++ b/configs/adapter/my_rein_rank_32.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_tokens_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + rank=32, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = 
dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_rank_64.py b/configs/adapter/my_rein_rank_64.py new file mode 100644 index 0000000..1679abf --- /dev/null +++ b/configs/adapter/my_rein_rank_64.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_tokens_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + rank=64, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_rank_8.py b/configs/adapter/my_rein_rank_8.py new file mode 100644 index 0000000..18f24b0 --- /dev/null +++ b/configs/adapter/my_rein_rank_8.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_tokens_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + rank=8, + ), + init_cfg=dict( + 
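+ # (ablation note) the my_rein_rank_8/16/32/64 configs differ only in reins_config.rank,
+ # which appears to control the low-rank (LoRA-style) dimension of the Rein tokens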
type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_token_dinov2_l1c.py b/configs/adapter/my_rein_token_dinov2_l1c.py new file mode 100644 index 0000000..1742372 --- /dev/null +++ b/configs/adapter/my_rein_token_dinov2_l1c.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_tokens_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = 
dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_token_dinov2_token_100.py b/configs/adapter/my_rein_token_dinov2_token_100.py new file mode 100644 index 0000000..74447e6 --- /dev/null +++ b/configs/adapter/my_rein_token_dinov2_token_100.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_tokens_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + token_length=100, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_token_dinov2_token_75.py b/configs/adapter/my_rein_token_dinov2_token_75.py new file mode 100644 index 0000000..039c451 --- /dev/null +++ b/configs/adapter/my_rein_token_dinov2_token_75.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_tokens_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), 
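+ # the data_preprocessor above uses ImageNet RGB statistics, converts BGR to RGB and
+ # pads images to 512x512 (label padding uses the ignore index 255); this presumably
+ # assumes the Sentinel-2 composites are already exported as 8-bit RGB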
+ backbone=dict( + img_size=512, + reins_config=dict( + token_length=75, + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/my_rein_token_mlp.py b/configs/adapter/my_rein_token_mlp.py new file mode 100644 index 0000000..3cc3c54 --- /dev/null +++ b/configs/adapter/my_rein_token_mlp.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/my_rein_token_mlp_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + reins_config=dict( + token_length=50 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", 
max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/pmaa_adapter_dinov2_l1c.py b/configs/adapter/pmaa_adapter_dinov2_l1c.py new file mode 100644 index 0000000..7e02ec8 --- /dev/null +++ b/configs/adapter/pmaa_adapter_dinov2_l1c.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/pmaa_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + pmaa_adapter_config=dict( + + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/pmaa_adapter_dinov2_l1c_context_64_local_32_global_2.py b/configs/adapter/pmaa_adapter_dinov2_l1c_context_64_local_32_global_2.py new file mode 100644 index 0000000..3423048 --- /dev/null +++ b/configs/adapter/pmaa_adapter_dinov2_l1c_context_64_local_32_global_2.py @@ -0,0 +1,87 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/pmaa_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + 
type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + pmaa_adapter_config=dict( + context_dim=64, + local_groups=32, + global_groups=2 + ), + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/adapter/vit_adapter_dinov2_l1c.py b/configs/adapter/vit_adapter_dinov2_l1c.py new file mode 100644 index 0000000..8cec7b7 --- /dev/null +++ b/configs/adapter/vit_adapter_dinov2_l1c.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/vit_adapter_dinov2.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + pretrain_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) 
+param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/clip/head_clip_l_mask2former_hrc_whu.py b/configs/clip/head_clip_l_mask2former_hrc_whu.py new file mode 100644 index 0000000..8a44df7 --- /dev/null +++ b/configs/clip/head_clip_l_mask2former_hrc_whu.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/hrc_whu.py", + "../_base_/default_runtime.py", + "../_base_/models/clip-L_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256,256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/ViT-L-14.pt", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=1500, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=1500, val_interval=15) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=15, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=15, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/convnext/convnext-L_mask2former_512x512_bs1x4.py b/configs/convnext/convnext-L_mask2former_512x512_bs1x4.py new file mode 100644 index 0000000..662cf61 --- /dev/null +++ b/configs/convnext/convnext-L_mask2former_512x512_bs1x4.py @@ -0,0 +1,58 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + 
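+ # dg_gta_512x512 is presumably the GTA->real domain-generalisation split carried over
+ # from the upstream Rein repository, not one of the cloud benchmarks used elsewhere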
"../_base_/default_runtime.py", + "../_base_/models/convnext-L_mask2former.py" +] +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/convnext/rein_convnext-L_mask2former_512x512_bs1x4.py b/configs/convnext/rein_convnext-L_mask2former_512x512_bs1x4.py new file mode 100644 index 0000000..2cce2af --- /dev/null +++ b/configs/convnext/rein_convnext-L_mask2former_512x512_bs1x4.py @@ -0,0 +1,39 @@ +_base_ = "./convnext-L_mask2former_512x512_bs1x4.py" +model = dict( + backbone=dict( + type="ReinsConvNeXt", + reins_config=dict( + type="LoRAReins", + token_length=100, + patch_size=16, + link_token_to_query=True, + lora_dim=16, + ), + distinct_cfgs=( + dict( + num_layers=3, + embed_dims=192, + ), + dict( + num_layers=3, + embed_dims=384, + ), + dict( + num_layers=27, + embed_dims=768, + ), + dict( + num_layers=3, + embed_dims=1536, + ), + ), + ), + decode_head=dict( + type="ReinMask2FormerHead", + replace_query_feat=True, + ), +) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", +) + diff --git a/configs/dinov2/convnext_dinov2_l_maskformer_gf1.py b/configs/dinov2/convnext_dinov2_l_maskformer_gf1.py new file mode 100644 index 0000000..33cd73b --- /dev/null +++ b/configs/dinov2/convnext_dinov2_l_maskformer_gf1.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf1.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + 
loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/convnext_dinov2_l_maskformer_gf2.py b/configs/dinov2/convnext_dinov2_l_maskformer_gf2.py new file mode 100644 index 0000000..3d3c8ad --- /dev/null +++ b/configs/dinov2/convnext_dinov2_l_maskformer_gf2.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + 
param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/convnext_dinov2_l_maskformer_hrc_whu.py b/configs/dinov2/convnext_dinov2_l_maskformer_hrc_whu.py new file mode 100644 index 0000000..f002903 --- /dev/null +++ b/configs/dinov2/convnext_dinov2_l_maskformer_hrc_whu.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/hrc_whu.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/convnext_dinov2_l_maskformer_l1c.py b/configs/dinov2/convnext_dinov2_l_maskformer_l1c.py new file mode 100644 index 0000000..a3481fb --- /dev/null +++ b/configs/dinov2/convnext_dinov2_l_maskformer_l1c.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 
解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/convnext_dinov2_l_maskformer_l2a.py b/configs/dinov2/convnext_dinov2_l_maskformer_l2a.py new file mode 100644 index 0000000..0b356bc --- /dev/null +++ b/configs/dinov2/convnext_dinov2_l_maskformer_l2a.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l2a.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, 
log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/convnext_dinov2_l_maskformer_l8.py b/configs/dinov2/convnext_dinov2_l_maskformer_l8.py new file mode 100644 index 0000000..fe85672 --- /dev/null +++ b/configs/dinov2/convnext_dinov2_l_maskformer_l8.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/convnext_dinov2_maskformer.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/dinov2_mask2former_512x512_bs1x4.py b/configs/dinov2/dinov2_mask2former_512x512_bs1x4.py new file mode 100644 index 0000000..75536b9 --- /dev/null +++ b/configs/dinov2/dinov2_mask2former_512x512_bs1x4.py @@ -0,0 +1,59 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_mask2former.py" +] +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, 
dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-08, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1)), + norm_decay_mult=0.0) +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/full_dinov2_l_mask2former_gf1.py b/configs/dinov2/full_dinov2_l_mask2former_gf1.py new file mode 100644 index 0000000..2e8c451 --- /dev/null +++ b/configs/dinov2/full_dinov2_l_mask2former_gf1.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf1.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + 
sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/full_dinov2_l_mask2former_gf2.py b/configs/dinov2/full_dinov2_l_mask2former_gf2.py new file mode 100644 index 0000000..41b505a --- /dev/null +++ b/configs/dinov2/full_dinov2_l_mask2former_gf2.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/full_dinov2_l_mask2former_hrc_whu.py b/configs/dinov2/full_dinov2_l_mask2former_hrc_whu.py new file mode 100644 index 0000000..267ab46 --- /dev/null +++ b/configs/dinov2/full_dinov2_l_mask2former_hrc_whu.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/hrc_whu.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py" +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model=dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256,256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + 
test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-08, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1)), + norm_decay_mult=0.0) +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/dinov2/full_dinov2_l_mask2former_l1c.py b/configs/dinov2/full_dinov2_l_mask2former_l1c.py new file mode 100644 index 0000000..1dd6260 --- /dev/null +++ b/configs/dinov2/full_dinov2_l_mask2former_l1c.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + 
checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/full_dinov2_l_mask2former_l2a.py b/configs/dinov2/full_dinov2_l_mask2former_l2a.py new file mode 100644 index 0000000..750ad70 --- /dev/null +++ b/configs/dinov2/full_dinov2_l_mask2former_l2a.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l2a.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/full_dinov2_l_mask2former_l8.py b/configs/dinov2/full_dinov2_l_mask2former_l8.py new file mode 100644 index 0000000..fa917c5 --- /dev/null +++ b/configs/dinov2/full_dinov2_l_mask2former_l8.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + 
use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/dinov2/head_dinov2_b_mask2former_gf2.py b/configs/dinov2/head_dinov2_b_mask2former_gf2.py new file mode 100644 index 0000000..1564f27 --- /dev/null +++ b/configs/dinov2/head_dinov2_b_mask2former_gf2.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_b_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_b_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + 
timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/dinov2/head_dinov2_l_mask2former_gf1.py b/configs/dinov2/head_dinov2_l_mask2former_gf1.py new file mode 100644 index 0000000..4d2de26 --- /dev/null +++ b/configs/dinov2/head_dinov2_l_mask2former_gf1.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf1.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/dinov2/head_dinov2_l_mask2former_gf2.py b/configs/dinov2/head_dinov2_l_mask2former_gf2.py new file mode 100644 index 0000000..6f42249 --- /dev/null +++ b/configs/dinov2/head_dinov2_l_mask2former_gf2.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + 
bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # address class imbalance + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 40k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/dinov2/head_dinov2_l_mask2former_hrc_whu.py b/configs/dinov2/head_dinov2_l_mask2former_hrc_whu.py new file mode 100644 index 0000000..2cf41f2 --- /dev/null +++ b/configs/dinov2/head_dinov2_l_mask2former_hrc_whu.py @@ -0,0 +1,92 @@ +''' +Author: JusperLee tsinghua.kaili@gmail.com +Date: 2024-10-21 15:35:04 +LastEditors: JusperLee tsinghua.kaili@gmail.com +LastEditTime: 2024-10-21 15:35:04 +FilePath: /rein/configs/dinov2/head_dinov2_l_mask2former_hrc_whu.py +Description: These are the default settings; set `customMade` and open koroFileHeader to view and adjust the configuration: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE +''' +# dataset config +_base_ = [ + "../_base_/datasets/hrc_whu.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # address class imbalance + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + 
paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/head_dinov2_l_mask2former_l1c.py b/configs/dinov2/head_dinov2_l_mask2former_l1c.py new file mode 100644 index 0000000..8c89b85 --- /dev/null +++ b/configs/dinov2/head_dinov2_l_mask2former_l1c.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file 
diff --git a/configs/dinov2/head_dinov2_l_mask2former_l2a.py b/configs/dinov2/head_dinov2_l_mask2former_l2a.py new file mode 100644 index 0000000..d2b2f99 --- /dev/null +++ b/configs/dinov2/head_dinov2_l_mask2former_l2a.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l2a.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # address class imbalance + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 40k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/dinov2/head_dinov2_l_mask2former_l8.py b/configs/dinov2/head_dinov2_l_mask2former_l8.py new file mode 100644 index 0000000..a6a5160 --- /dev/null +++ b/configs/dinov2/head_dinov2_l_mask2former_l8.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # address class imbalance + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 1, 1, 0.1] + ), + ), + 
test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/head_dinov2_s_mask2former_gf2.py b/configs/dinov2/head_dinov2_s_mask2former_gf2.py new file mode 100644 index 0000000..1e5ed13 --- /dev/null +++ b/configs/dinov2/head_dinov2_s_mask2former_gf2.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_s_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_s_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + 
param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/dinov2/rein_dinov2_mask2former_1024x1024_bs4x2.py b/configs/dinov2/rein_dinov2_mask2former_1024x1024_bs4x2.py new file mode 100644 index 0000000..31f7a4a --- /dev/null +++ b/configs/dinov2/rein_dinov2_mask2former_1024x1024_bs4x2.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_citys2acdc_1024x1024.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_dinov2_mask2former.py", +] +crop_size = (1024, 1024) +model = dict( + backbone=dict( + img_size=1024, + init_cfg=dict( + checkpoint="checkpoints/dinov2_converted_1024x1024.pth", + ), + ), + data_preprocessor=dict( + size=crop_size, + ), + test_cfg=dict( + crop_size=(1024, 1024), + stride=(683, 683), + ), +) +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(1024 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=4096, + ), + dict(type="RandomCrop", crop_size=crop_size, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.00006, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="LinearLR", start_factor=1e-6, by_epoch=False, begin=0, end=10000), + dict( + type="PolyLR", + eta_min=0.0, + power=0.9, + begin=10000, + end=40000, + by_epoch=False, + ), +] +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) +find_unused_parameters = True diff --git a/configs/dinov2/rein_dinov2_mask2former_512x512_bs1x4.py b/configs/dinov2/rein_dinov2_mask2former_512x512_bs1x4.py new file mode 100644 index 0000000..8b6fe80 --- /dev/null +++ b/configs/dinov2/rein_dinov2_mask2former_512x512_bs1x4.py @@ -0,0 +1,59 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_dinov2_mask2former.py" +] +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", 
crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/rein_dinov2_mask2former_gf2.py b/configs/dinov2/rein_dinov2_mask2former_gf2.py new file mode 100644 index 0000000..55d2010 --- /dev/null +++ b/configs/dinov2/rein_dinov2_mask2former_gf2.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_dinov2_mask2former.py" +] + +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + 
checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/dinov2/rein_dinov2_mask2former_l1c.py b/configs/dinov2/rein_dinov2_mask2former_l1c.py new file mode 100644 index 0000000..deb8568 --- /dev/null +++ b/configs/dinov2/rein_dinov2_mask2former_l1c.py @@ -0,0 +1,82 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_dinov2_mask2former.py" +] + +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/dinov2_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/frozen_vfms/clip-L_mask2former.py b/configs/frozen_vfms/clip-L_mask2former.py new file mode 100644 index 0000000..bf9fc63 --- /dev/null +++ b/configs/frozen_vfms/clip-L_mask2former.py @@ -0,0 +1,62 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/clip-L_mask2former.py", +] +model = dict(type="FrozenBackboneEncoderDecoder") +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no 
weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "query_embed": embed_multi, + "query_feat": embed_multi, + "level_embed": embed_multi, + "norm": dict(decay_mult=0.0), + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) +''' +2024/02/05 07:45:55 - mmengine - INFO - Iter(val) [3500/3500] citys_aAcc: 89.0100 citys_mIoU: 51.4900 citys_mAcc: 67.1600 bdd_aAcc: 88.5600 bdd_mIoU: 49.5100 bdd_mAcc: 61.6800 map_aAcc: 89.5800 map_mIoU: 55.5500 map_mAcc: 69.1900 mean_aAcc: 89.0500 mean_mIoU: 52.1833 mean_mAcc: 66.0100 data_time: 0.0032 time: 0.3971 +''' \ No newline at end of file diff --git a/configs/frozen_vfms/dinov2-L_mask2former.py b/configs/frozen_vfms/dinov2-L_mask2former.py new file mode 100644 index 0000000..2b0d5b1 --- /dev/null +++ b/configs/frozen_vfms/dinov2-L_mask2former.py @@ -0,0 +1,62 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/dinov2_mask2former.py", +] +model = dict(type="FrozenBackboneEncoderDecoder") +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "query_embed": embed_multi, + "query_feat": embed_multi, + "level_embed": embed_multi, + "norm": dict(decay_mult=0.0), + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + 
visualization=dict(type="SegVisualizationHook"), +) +''' +2024/02/05 07:45:55 - mmengine - INFO - Iter(val) [3500/3500] citys_aAcc: 89.0100 citys_mIoU: 51.4900 citys_mAcc: 67.1600 bdd_aAcc: 88.5600 bdd_mIoU: 49.5100 bdd_mAcc: 61.6800 map_aAcc: 89.5800 map_mIoU: 55.5500 map_mAcc: 69.1900 mean_aAcc: 89.0500 mean_mIoU: 52.1833 mean_mAcc: 66.0100 data_time: 0.0032 time: 0.3971 +''' diff --git a/configs/frozen_vfms/eva02-L_mask2former.py b/configs/frozen_vfms/eva02-L_mask2former.py new file mode 100644 index 0000000..4f38d86 --- /dev/null +++ b/configs/frozen_vfms/eva02-L_mask2former.py @@ -0,0 +1,59 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/eva02-L_mask2former.py", +] +model = dict(type="FrozenBackboneEncoderDecoder") +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "query_embed": embed_multi, + "query_feat": embed_multi, + "level_embed": embed_multi, + "norm": dict(decay_mult=0.0), + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/frozen_vfms/sam-h_mask2former.py b/configs/frozen_vfms/sam-h_mask2former.py new file mode 100644 index 0000000..4833258 --- /dev/null +++ b/configs/frozen_vfms/sam-h_mask2former.py @@ -0,0 +1,59 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-h_mask2former.py", +] +model = dict(type="FrozenBackboneEncoderDecoder") +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, 
decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "query_embed": embed_multi, + "query_feat": embed_multi, + "level_embed": embed_multi, + "norm": dict(decay_mult=0.0), + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/resnet/rein_resnet50_mask2former_512x512_bs1x4.py b/configs/resnet/rein_resnet50_mask2former_512x512_bs1x4.py new file mode 100644 index 0000000..1036106 --- /dev/null +++ b/configs/resnet/rein_resnet50_mask2former_512x512_bs1x4.py @@ -0,0 +1,59 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/rein_resnet50_mask2former.py" +] +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/sam/full_sam_l_mask2former_gf1.py b/configs/sam/full_sam_l_mask2former_gf1.py new file mode 100644 index 0000000..2e175f6 --- /dev/null +++ b/configs/sam/full_sam_l_mask2former_gf1.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf1.py", + "../_base_/default_runtime.py", + 
"../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/full_sam_l_mask2former_gf2.py b/configs/sam/full_sam_l_mask2former_gf2.py new file mode 100644 index 0000000..815a063 --- /dev/null +++ b/configs/sam/full_sam_l_mask2former_gf2.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + 
level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/sam/full_sam_l_mask2former_hrc_whu.py b/configs/sam/full_sam_l_mask2former_hrc_whu.py new file mode 100644 index 0000000..38d4bfa --- /dev/null +++ b/configs/sam/full_sam_l_mask2former_hrc_whu.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/hrc_whu.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py" +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model=dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256,256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-08, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1)), + norm_decay_mult=0.0) +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/full_sam_l_mask2former_l1c.py b/configs/sam/full_sam_l_mask2former_l1c.py new file mode 100644 index 0000000..ab4ea76 --- /dev/null +++ b/configs/sam/full_sam_l_mask2former_l1c.py @@ -0,0 +1,83 @@ +# dataset 
config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/sam/full_sam_l_mask2former_l2a.py b/configs/sam/full_sam_l_mask2former_l2a.py new file mode 100644 index 0000000..3172e09 --- /dev/null +++ b/configs/sam/full_sam_l_mask2former_l2a.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l2a.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + 
query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/sam/full_sam_l_mask2former_l8.py b/configs/sam/full_sam_l_mask2former_l8.py new file mode 100644 index 0000000..29faaf2 --- /dev/null +++ b/configs/sam/full_sam_l_mask2former_l8.py @@ -0,0 +1,83 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/head_sam_b_mask2former_gf2.py b/configs/sam/head_sam_b_mask2former_gf2.py new file mode 100644 index 0000000..7f18e02 
--- /dev/null +++ b/configs/sam/head_sam_b_mask2former_gf2.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-b_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_b_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/head_sam_h_mask2former_gf2.py b/configs/sam/head_sam_h_mask2former_gf2.py new file mode 100644 index 0000000..6e1d269 --- /dev/null +++ b/configs/sam/head_sam_h_mask2former_gf2.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-h_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_h_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + 
optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/head_sam_l_mask2former_gf1.py b/configs/sam/head_sam_l_mask2former_gf1.py new file mode 100644 index 0000000..f597186 --- /dev/null +++ b/configs/sam/head_sam_l_mask2former_gf1.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf1.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + 
sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/head_sam_l_mask2former_gf2.py b/configs/sam/head_sam_l_mask2former_gf2.py new file mode 100644 index 0000000..6beb051 --- /dev/null +++ b/configs/sam/head_sam_l_mask2former_gf2.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/gf12ms_whu_gf2.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256, 256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/head_sam_l_mask2former_hrc_whu.py b/configs/sam/head_sam_l_mask2former_hrc_whu.py new file mode 100644 index 0000000..ee82071 --- /dev/null +++ b/configs/sam/head_sam_l_mask2former_hrc_whu.py @@ -0,0 +1,85 @@ +# dataset config +_base_ = [ + "../_base_/datasets/hrc_whu.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py" +] +# model + +# crop_size = (256, 256) +num_classes = 2 + +model=dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(256,256), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=256, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_256x256.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + 
reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-08, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1)), + norm_decay_mult=0.0) +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) \ No newline at end of file diff --git a/configs/sam/head_sam_l_mask2former_l1c.py b/configs/sam/head_sam_l_mask2former_l1c.py new file mode 100644 index 0000000..bf417be --- /dev/null +++ b/configs/sam/head_sam_l_mask2former_l1c.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l1c.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + 
logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/sam/head_sam_l_mask2former_l2a.py b/configs/sam/head_sam_l_mask2former_l2a.py new file mode 100644 index 0000000..102da34 --- /dev/null +++ b/configs/sam/head_sam_l_mask2former_l2a.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/cloudsen12_high_l2a.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + checkpoint="checkpoints/sam_l_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/sam/head_sam_l_mask2former_l8.py b/configs/sam/head_sam_l_mask2former_l8.py new file mode 100644 index 0000000..5a64d80 --- /dev/null +++ b/configs/sam/head_sam_l_mask2former_l8.py @@ -0,0 +1,84 @@ +# dataset config +_base_ = [ + "../_base_/datasets/l8_biome.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-l_mask2former.py", +] +# model + +# crop_size = (256, 256) +num_classes = 4 + +model = dict( + type="FrozenBackboneEncoderDecoder", + data_preprocessor=dict( + type="SegDataPreProcessor", + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + size=(512, 512), + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + ), + backbone=dict( + img_size=512, + init_cfg=dict( + type="Pretrained", + 
checkpoint="checkpoints/sam_l_converted_512x512.pth", + ), + ), + decode_head=dict( + num_classes=num_classes, + loss_cls=dict( + type="mmdet.CrossEntropyLoss", # 解决类别不均衡 + use_sigmoid=False, + loss_weight=2.0, + reduction="mean", + class_weight=[1.0] * num_classes + [0.1], # [1, 1, 0.1] + ), + ), + test_cfg=dict(), +) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-08, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys=dict( + norm=dict(decay_mult=0.0), + query_embed=dict(lr_mult=1.0, decay_mult=0.0), + query_feat=dict(lr_mult=1.0, decay_mult=0.0), + level_embed=dict(lr_mult=1.0, decay_mult=0.0), + backbone=dict(lr_mult=0.1), + ), + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k +# train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=4000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=4000, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + save_best=["mIoU"], + rule="greater", + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) diff --git a/configs/sam/rein_sam-h_mask2former_512x512_bs1x4.py b/configs/sam/rein_sam-h_mask2former_512x512_bs1x4.py new file mode 100644 index 0000000..3478e16 --- /dev/null +++ b/configs/sam/rein_sam-h_mask2former_512x512_bs1x4.py @@ -0,0 +1,79 @@ +# dataset config +_base_ = [ + "../_base_/datasets/dg_gta_512x512.py", + "../_base_/default_runtime.py", + "../_base_/models/sam-vit-h_mask2former.py", +] +model = dict( + backbone=dict( + type="ReinsSAMViT", + reins_config=dict( + type="LoRAReins", + token_length=100, + embed_dims=1280, + num_layers=4, + patch_size=16, + link_token_to_query=True, + lora_dim=16, + zero_mlp_delta_f=False, # v2 + ), + ), + decode_head=dict( + type="ReinMask2FormerHead", + ), +) +train_pipeline = [ + dict(type="LoadImageFromFile"), + dict(type="LoadAnnotations"), + dict( + type="RandomChoiceResize", + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type="ResizeShortestEdge", + max_size=2048, + ), + dict(type="RandomCrop", crop_size={{_base_.crop_size}}, cat_max_ratio=0.75), + dict(type="RandomFlip", prob=0.5), + dict(type="PhotoMetricDistortion"), + dict(type="PackSegInputs"), +] +train_dataloader = dict(batch_size=4, dataset=dict(pipeline=train_pipeline)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + constructor="PEFTOptimWrapperConstructor", + optimizer=dict( + type="AdamW", lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999) + ), + paramwise_cfg=dict( + custom_keys={ + "norm": dict(decay_mult=0.0), + "query_embed": embed_multi, + "level_embed": embed_multi, + "learnable_tokens": embed_multi, + "reins.scale": embed_multi, + }, + norm_decay_mult=0.0, + ), +) +param_scheduler = [ + dict(type="PolyLR", eta_min=0, power=0.9, begin=0, end=40000, by_epoch=False) +] + +# training schedule for 160k 
+train_cfg = dict(type="IterBasedTrainLoop", max_iters=40000, val_interval=10000) +val_cfg = dict(type="ValLoop") +test_cfg = dict(type="TestLoop") +default_hooks = dict( + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook", interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", by_epoch=False, interval=4000, max_keep_ckpts=3 + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + visualization=dict(type="SegVisualizationHook"), +) +find_unused_parameters = True +auto_scale_lr = dict(enable=False, base_batch_size=4) # v2 diff --git a/dataset/cloudsen12_high.py b/dataset/cloudsen12_high.py new file mode 100644 index 0000000..2d067e5 --- /dev/null +++ b/dataset/cloudsen12_high.py @@ -0,0 +1,174 @@ +import os +os.environ["NO_ALBUMENTATIONS_UPDATE"] = "1" +import albumentations as albu +import albumentations.pytorch +import numpy as np +from torch.utils.data import Dataset +import numpy as np +from typing import Literal, List, Tuple +import matplotlib.pyplot as plt +from utils.stretch import gaussian_stretch +import torch + + +class CloudSEN12High(Dataset): + METAINFO = dict( + classes=("clear", "thick cloud", "thin cloud", "cloud shadow"), + palette=( + (0, 0, 0), + (255, 255, 255), + (170, 170, 170), + (85, 85, 85), + ), + img_size=(512, 512), + ann_size=(512, 512), + train_size=8490, + val_size=535, + test_size=975, + ) + + def __init__( + self, + root: str = "data/cloudsen12_high", + phase: Literal["train", "val", "test"] = "train", + level: Literal["l1c", "l2a"] = "l1c", + bands: List[str] = ["B4", "B3", "B2"], + all_transform: albu.Compose = None, + img_transform: albu.Compose = None, + ann_transform: albu.Compose = None, + ): + self.root = root + self.phase = phase + self.level = level + self.bands = bands + self.all_transform = all_transform + self.img_transform = img_transform + self.ann_transform = ann_transform + self.image_data, self.label_data = self.load_data() + + def load_data(self): + image_data = self.__load_image() + label_data = self.__load_label() + return image_data, label_data + + def __load_label(self) -> np.ndarray: + label_path = os.path.join(self.root, self.phase, "LABEL_manual_hq.dat") + label_data = np.memmap(filename=label_path, dtype=np.int8, mode="r").reshape(-1, *self.METAINFO["ann_size"]) + label_data = label_data.astype(np.int64) + return label_data + + def __load_image(self) -> np.ndarray: + image_data = { + band: self.__load_image_by_band(band) for band in self.bands + } + return image_data + + def __load_image_by_band(self, band) -> Tuple[np.ndarray, np.ndarray]: + if "S1" in band: + image_path = os.path.join(self.root, self.phase, band) + ".dat" + dtype = np.float32 + else: + image_path = os.path.join(self.root, self.phase, "_".join([self.level.upper(), band])) + ".dat" + dtype = np.int16 + image = np.memmap(filename=image_path, dtype=dtype, mode="r").reshape(-1, *self.METAINFO["img_size"]) + return image + + def __len__(self) -> int: + return len(self.label_data) + + def __getitem__(self, idx): + img = np.stack([self.image_data[band][idx] for band in self.bands], axis=-1) + img = img.astype(np.float32) + img = (img - img.min()) / (img.max() - img.min() + 1e-6) # 1e-6 to avoid division by zero + ann = self.label_data[idx] + + if self.all_transform: + albumention = self.all_transform(image=img, mask=ann) + img = albumention["image"] + ann = albumention["mask"] + + if self.img_transform: + img = self.img_transform(image=img)["image"] + + if self.ann_transform: + 
ann = self.ann_transform(image=ann)["image"] + + return { + "img": img, + "ann": ann, + } + + +def test_cloudsen12_high(): + for phase in ["train", "val", "test"]: + dataset = CloudSEN12High(phase=phase) + assert len(dataset)==CloudSEN12High.METAINFO[f"{phase}_size"] + assert dataset[0]["img"].shape == (*CloudSEN12High.METAINFO["ann_size"], len(["B4", "B3", "B2"])) + assert dataset[0]["ann"].shape == CloudSEN12High.METAINFO["ann_size"] + + +def show_cloudsen12_high(): + all_transform = albu.Compose([ + albu.PadIfNeeded(min_height=512, min_width=512, p=1, always_apply=True), + albu.OneOf([ + albu.HorizontalFlip(p=0.5), + albu.VerticalFlip(p=0.5), + albu.RandomRotate90(p=0.5), + albu.Transpose(p=0.5), + ], p=1), + albu.pytorch.transforms.ToTensorV2(), + ]) + + plt.figure(figsize=(16, 4)) + + + + for index in range(8490): + plt.subplot(1, 4, 1) + plt.title("L1C") + plt.axis("off") + seed_everything(42) + dataset = CloudSEN12High(phase="train", level="l1c", bands=["B4", "B3", "B2"], all_transform=all_transform) + data = dataset[index]["img"] + data = data.permute(1, 2, 0).numpy() + data = gaussian_stretch(data) + plt.imshow(data) + + plt.subplot(1, 4, 2) + plt.title("L2A") + plt.axis("off") + seed_everything(42) + dataset = CloudSEN12High(phase="train", level="l2a", bands=["B4", "B3", "B2"], all_transform=all_transform) + data = dataset[index]["img"] + data = data.permute(1, 2, 0).numpy() + data = gaussian_stretch(data) + plt.imshow(data) + + plt.subplot(1, 4, 3) + plt.title("SAR") + plt.axis("off") + seed_everything(42) + dataset = CloudSEN12High(phase="train", level="l1c", bands=["S1_VV", "S1_VH"], all_transform=all_transform) + data = dataset[index]["img"] + new_channel = (data[0] + data[1]) / 2 + data = torch.stack([data[0], data[1], new_channel]) + data = data.permute(1, 2, 0).numpy() + data = gaussian_stretch(data) + plt.imshow(data) + + plt.subplot(1, 4, 4) + plt.title("ANN") + plt.axis("off") + seed_everything(42) + dataset = CloudSEN12High(phase="train", level="l1c", bands=["B4", "B3", "B2"], all_transform=all_transform) + ann = dataset[index]["ann"] + color_map = np.array(CloudSEN12High.METAINFO["palette"]) + ann = color_map[ann] + plt.imshow(ann) + + plt.savefig("cloudsen12_high.png", bbox_inches="tight", pad_inches=0) + + +if __name__ == "__main__": + test_cloudsen12_high() + show_cloudsen12_high() diff --git a/dataset/gf12ms_whu.py b/dataset/gf12ms_whu.py new file mode 100644 index 0000000..4e62fcf --- /dev/null +++ b/dataset/gf12ms_whu.py @@ -0,0 +1,158 @@ +import os +from glob import glob +from typing import Literal, List + +import albumentations +import numpy as np +import tifffile as tf +from PIL import Image +from torch.utils.data import Dataset +import matplotlib.pyplot as plt +import albumentations.pytorch +from src.utils.stretch import gaussian_stretch + + +class GF12MSWHU(Dataset): + METAINFO = dict( + classes=("clear", "cloud"), + palette=((0, 0, 0), (255, 255, 255)), + img_size=(250, 250), + ann_size=(250, 250), + ) + + def __init__( + self, + root: str = "data/gf12ms_whu", + phase: Literal["train", "val", "test"] = "train", + serial: Literal["gf1", "gf2", "all"] = "all", + bands: List[str] = ["B3", "B2", "B1"], # only B1, B2, B3, B4 are available, B4 is nir, and B3, B2, B1 are rgb + all_transform: albumentations.Compose = None, + img_transform: albumentations.Compose = None, + ann_transform: albumentations.Compose = None, + ) -> None: + super().__init__() + self.image_paths, self.mask_paths = self.__load_data(root, phase, serial) + self.bands = bands +
self.all_transform = all_transform + self.img_transform = img_transform + self.ann_transform = ann_transform + + def __load_data(self, root:str, phase:str, serial:str): + if phase == "train": + filename = "TrainBlock250" + elif phase == "val" or phase == "test": + filename = "TestBlock250" + else: + raise ValueError( + "phase must be one of 'train','val','test', but got {}".format(phase) + ) + + if serial == "all": + serial = "**" + elif serial == "gf1": + serial = "GF1MS-WHU" + elif serial == "gf2": + serial = "GF2MS-WHU" + else: + raise ValueError("serial must be one of 'gf1','gf2','all', but got {}".format(serial)) + + mask_paths = glob(os.path.join(root, serial, filename, "*.tif")) + image_paths = [ + filename.replace("_Mask", "").replace("tif", "tiff") + for filename in mask_paths + ] + return image_paths, mask_paths + + def __len__(self): + return len(self.image_paths) + + def __getitem__(self, idx): + image_path = self.image_paths[idx] + mask_path = self.mask_paths[idx] + + image = tf.imread(image_path).transpose(1, 2, 0) # (C, H, W) -> (H, W, C) + + # bands + if len(self.bands)>4: + raise ValueError("The number of bands must be less than 4") + else: + tmp = np.zeros((image.shape[0], image.shape[1], len(self.bands)), dtype=np.float32) + for i, band in enumerate(self.bands): + if band == "B1": + tmp[:,:,i] = image[:,:,0] + elif band == "B2": + tmp[:,:,i] = image[:,:,1] + elif band == "B3": + tmp[:,:,i] = image[:,:,2] + elif band == "B4": + tmp[:,:,i] = image[:,:,3] + else: + raise ValueError("The band must be one of 'B1','B2','B3','B4', but got {}".format(band)) + image = tmp + + image = (image - image.min()) / (image.max() - image.min() + 1e-6) # normalize + + mask = np.array(Image.open(mask_path)) # (H, W) + + if self.all_transform: + transformed = self.all_transform(image=image, mask=mask) + image = transformed["image"] + mask = transformed["mask"] + if self.img_transform: + image = self.img_transform(image=image)["image"] + if self.ann_transform: + mask = self.ann_transform(image=mask)["image"] + + return {"img": image, "ann": np.int64(mask), "img_path": image_path} + + +def show_gf12ms_whu(): + all_transform = albumentations.Compose([ + albumentations.OneOf([ + albumentations.HorizontalFlip(p=0.5), + albumentations.VerticalFlip(p=0.5), + albumentations.RandomRotate90(p=0.5), + albumentations.Transpose(p=0.5), + ], p=1), + ]) + + img_transform = albumentations.Compose([ + albumentations.pytorch.ToTensorV2(), + ]) + + gf1_train_dataset = GF12MSWHU(phase="train", serial="gf1", all_transform=all_transform, img_transform=img_transform) + + gf2_train_dataset = GF12MSWHU(phase="train", serial="gf2", all_transform=all_transform, img_transform=img_transform) + + for gf1, gf2 in zip(gf1_train_dataset, gf2_train_dataset): + plt.figure(figsize=(12, 12)) + + plt.subplot(2, 2, 1) + img = gf1["img"].permute(1, 2, 0) + img = (img*255).numpy().astype(np.uint8) + plt.imshow(gaussian_stretch(img)) + plt.title("GF1_img") + plt.axis("off") + + plt.subplot(2, 2, 2) + plt.imshow(gf1["ann"]) + plt.title("GF1_ann") + plt.axis("off") + + plt.subplot(2, 2, 3) + img = gf2["img"].permute(1, 2, 0) + img = (img*255).numpy().astype(np.uint8) + plt.imshow(gaussian_stretch(img)) + plt.title("GF2_img") + plt.axis("off") + + plt.subplot(2, 2, 4) + plt.imshow(gf2["ann"]) + plt.title("GF2_ann") + plt.axis("off") + + plt.savefig("gf12ms_whu.png", bbox_inches="tight", pad_inches=0) + + +if __name__ == "__main__": + show_gf12ms_whu() diff --git a/dataset/hrc_whu.py b/dataset/hrc_whu.py new file mode 100644 index 
0000000..b3aa80f --- /dev/null +++ b/dataset/hrc_whu.py @@ -0,0 +1,124 @@ +import os + +import albumentations +import numpy as np +from PIL import Image +from torch.utils.data import Dataset + + +class HRC_WHU(Dataset): + METAINFO = dict( + classes=('clear sky', 'cloud'), + palette=((0, 0, 0),(255, 255, 255)), + img_size=(3, 256, 256), # C, H, W + ann_size=(256, 256), # C, H, W + train_size=120, + test_size=30, + ) + + def __init__( + self, + root: str = "data/hrc_whu", + phase: str = "train", + all_transform: albumentations.Compose = None, + img_transform: albumentations.Compose = None, + ann_transform: albumentations.Compose = None + ): + self.root = root + self.phase = phase + self.all_transform = all_transform + self.img_transform = img_transform + self.ann_transform = ann_transform + self.data = self.load_data() + + + def load_data(self): + data = [] + split = 'train' if self.phase == 'train' else 'test' + split_file = os.path.join(self.root, f'{split}.txt') + with open(split_file, 'r') as f: + for line in f: + image_file = line.strip() + img_path = os.path.join(self.root, 'img_dir', split, image_file) + ann_path = os.path.join(self.root, 'ann_dir', split, image_file) + lac_type = image_file.split('_')[0] + data.append((img_path, ann_path, lac_type)) + return data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + img_path, ann_path, lac_type = self.data[idx] + img = Image.open(img_path) + ann = Image.open(ann_path) + + img = np.array(img) + ann = np.array(ann) + + if self.all_transform: + albumention = self.all_transform(image=img, mask=ann) + img = albumention['image'] + ann = albumention['mask'] + + if self.img_transform: + img = self.img_transform(image=img)['image'] + + if self.ann_transform: + ann = self.ann_transform(image=img)['image'] + + return { + 'img': img, + 'ann': np.int64(ann), + 'img_path': img_path, + 'ann_path': ann_path, + 'lac_type': lac_type, + } + + +if __name__ == '__main__': + import torchvision.transforms as transforms + import torch + + # all_transform = transforms.Compose([ + # transforms.RandomCrop((256, 256)), + # ]) + all_transform = transforms.RandomCrop((256, 256)) + + # img_transform = transforms.Compose([ + # transforms.ToTensor(), + # ]) + + img_transform = transforms.ToTensor() + + # ann_transform = transforms.Compose([ + # transforms.PILToTensor(), + # ]) + ann_transform = transforms.PILToTensor() + + train_dataset = HRC_WHU(root='data/hrc_whu', phase='train', all_transform=all_transform, img_transform=img_transform, + ann_transform=ann_transform) + test_dataset = HRC_WHU(root='data/hrc_whu', phase='test', all_transform=all_transform, img_transform=img_transform, + ann_transform=ann_transform) + + assert len(train_dataset) == train_dataset.METAINFO['train_size'] + assert len(test_dataset) == test_dataset.METAINFO['test_size'] + + train_sample = train_dataset[0] + test_sample = test_dataset[0] + + assert train_sample['img'].shape == test_sample['img'].shape == train_dataset.METAINFO['img_size'] + assert train_sample['ann'].shape == test_sample['ann'].shape == train_dataset.METAINFO['ann_size'] + + import matplotlib.pyplot as plt + + fig, axs = plt.subplots(1, 2, figsize=(10, 5)) + for train_sample in train_dataset: + axs[0].imshow(train_sample['img'].permute(1, 2, 0)) + axs[0].set_title('Image') + axs[1].imshow(torch.tensor(train_dataset.METAINFO['palette'])[train_sample['ann']]) + axs[1].set_title('Annotation') + plt.suptitle(f'Land Cover Type: {train_sample["lac_type"].capitalize()}', y=0.8) + plt.tight_layout() + 
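+    # Note: each pass through this demo loop redraws the same two axes and overwrites
+    # HRC_WHU_sample.png below, so only the last training sample is kept on disk;
+    # uncomment the `break` after plt.savefig to stop after the first sample.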
plt.savefig('HRC_WHU_sample.png', bbox_inches="tight") + # break diff --git a/dataset/l8_biome.py b/dataset/l8_biome.py new file mode 100644 index 0000000..30de4d1 --- /dev/null +++ b/dataset/l8_biome.py @@ -0,0 +1,412 @@ +import os +import pathlib +import albumentations +import numpy as np +from glob import glob +from PIL import Image +from torch.utils.data import Dataset +from natsort import natsorted +from typing import List, Literal +from torchgeo.datasets import RasterDataset as RasterDatasetBase +from torchgeo.datasets import IntersectionDataset +from torchgeo.datasets import L8Biome as L8BiomeBase +from typing import Any, cast, Sequence, Callable, ClassVar, Iterable +import torch +from torch import Tensor +import re +import glob +from rasterio.crs import CRS +from matplotlib.figure import Figure +import sys +import matplotlib.pyplot as plt +from torchgeo.datasets.errors import DatasetNotFoundError, RGBBandsMissingError +from torchgeo.datasets.utils import BoundingBox, Path, download_url, extract_archive + + +class RasterDataset(RasterDatasetBase): + """RasterDataset with added landcover information in the sample.""" + + def __init__( + self, + paths: Path | Iterable[Path] = "data", + crs: CRS | None = None, + res: float | None = None, + bands: Sequence[str] | None = ("B4", "B3", "B2"), + all_transform=None, + img_transform=None, + ann_transform=None, + cache: bool = True, + ): + super().__init__(paths=paths, crs=crs, res=res, bands=bands, transforms=None, cache=cache) + self.all_transform = all_transform + self.img_transform = img_transform + self.ann_transform = ann_transform + + def __getitem__(self, query: BoundingBox) -> dict[str, Any]: + """Retrieve image/mask, metadata, and landcover information indexed by query. + + Args: + query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index + + Returns: + sample of image/mask, metadata, and landcover information at that index + + Raises: + IndexError: if query is not found in the index + """ + hits = self.index.intersection(tuple(query), objects=True) + filepaths = cast(list[Path], [hit.object for hit in hits]) + + if not filepaths: + raise IndexError( + f"query: {query} not found in index with bounds: {self.bounds}" + ) + + if self.separate_files: + data_list: list[Tensor] = [] + filename_regex = re.compile(self.filename_regex, re.VERBOSE) + for band in self.bands: + band_filepaths = [] + for filepath in filepaths: + filename = os.path.basename(filepath) + directory = os.path.dirname(filepath) + match = re.match(filename_regex, filename) + if match: + if "band" in match.groupdict(): + start = match.start("band") + end = match.end("band") + filename = filename[:start] + band + filename[end:] + filepath = os.path.join(directory, filename) + band_filepaths.append(filepath) + data_list.append(self._merge_files(band_filepaths, query)) + data = torch.cat(data_list) + else: + data = self._merge_files(filepaths, query, self.band_indexes) + + # Get landcover value from the file path (third last folder) + landcover = filepaths[0].split(os.path.sep)[-3] + + if landcover == "grass_crops": + landcover = "Grass/Crops" + elif landcover == "snow_ice": + landcover = "Snow/Ice" + else: + landcover = landcover.capitalize() + + sample = {"crs": self.crs, "bounds": query, "landcover": landcover} + + data = data.to(self.dtype) + if self.is_image: + sample["image"] = data + else: + sample["mask"] = data.squeeze(0) + + if self.all_transform is not None: + transforms = self.all_transform(image=sample["image"], mask=sample["mask"]) + sample["image"], 
sample["mask"] = transforms["image"], transforms["mask"] + + if self.img_transform is not None: + sample["image"] = self.img_transform(image=sample["image"])["image"] + + if self.ann_transform is not None: + sample["mask"] = self.img_transform(image=sample["mask"])["image"] + + if self.is_image: + sample["img"] = sample["image"] + else: + sample["ann"] = sample["mask"] + sample["ldc"] = sample["landcover"] + + return sample + + +class L8BiomeImage(RasterDataset): + """Images from the L8 Biome dataset.""" + + # https://gisgeography.com/landsat-file-naming-convention/ + filename_glob = "LC8*.TIF" + filename_regex = r""" + ^LC8 + (?P\d{3}) + (?P\d{3}) + (?P\d{7}) + (?P[A-Z]{3}) + (?P\d{2}) + \.TIF$ + """ + date_format = "%Y%j" + is_image = True + rgb_bands = ("B4", "B3", "B2") + all_bands = ("B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B9", "B10", "B11") + + +class L8BiomeMask(RasterDataset): + """Masks from the L8 Biome dataset.""" + + # https://gisgeography.com/landsat-file-naming-convention/ + filename_glob = "LC8*_fixedmask.TIF" + filename_regex = r""" + ^LC8 + (?P\d{3}) + (?P\d{3}) + (?P\d{7}) + (?P[A-Z]{3}) + (?P\d{2}) + _fixedmask + \.TIF$ + """ + date_format = "%Y%j" + is_image = False + classes = ("Fill", "Cloud Shadow", "Clear", "Thin Cloud", "Cloud") + ordinal_map = torch.zeros(256, dtype=torch.long) + ordinal_map[64] = 1 + ordinal_map[128] = 0 # Fill is respected as clear + ordinal_map[192] = 2 + ordinal_map[255] = 3 + + def __getitem__(self, query: BoundingBox) -> dict[str, Any]: + """Retrieve image/mask and metadata indexed by query. + + Args: + query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index + Returns: + sample of image, mask and metadata at that index + Raises: + IndexError: if query is not found in the index + """ + sample = super().__getitem__(query) + sample["mask"] = self.ordinal_map[sample["mask"]] + sample["ann"] = self.ordinal_map[sample["ann"]] + return sample + + +class L8Biome(IntersectionDataset): + """L8 Biome dataset. + + The `L8 Biome `__ + dataset is a validation dataset for cloud cover assessment algorithms, consisting + of Pre-Collection Landsat 8 Operational Land Imager (OLI) Thermal Infrared Sensor + (TIRS) terrain-corrected (Level-1T) scenes. + + Dataset features: + + * Images evenly divided between 8 unique biomes + * 96 scenes from Landsat 8 OLI/TIRS sensors + * Imagery from global tiles between April 2013--October 2014 + * 11 Level-1 spectral bands with 30 m per pixel resolution + + Dataset format: + + * Images are composed of single multiband geotiffs + * Labels are multiclass, stored in single geotiffs + * Quality assurance bands, stored in single geotiffs + * Level-1 metadata (MTL.txt file) + * Landsat 8 OLI/TIRS bands: (B1, B2, B3, B4, B5, B6, B7, B8, B9, B10, B11) + + Dataset classes: + + 0. Fill + 1. Cloud Shadow + 2. Clear + 3. Thin Cloud + 4. Cloud + + If you use this dataset in your research, please cite the following: + + * https://doi.org/10.5066/F7251GDH + * https://doi.org/10.1016/j.rse.2017.03.026 + + .. 
versionadded:: 0.5 + """ + + METAINFO = dict( + classes=("clear", "cloud shadow", "thin cloud", "cloud"), + palette=( + (0, 0, 0), + (85, 85, 85), + (170, 170, 170), + (255, 255, 255), + ), + img_size=(512, 512), # H, W + ann_size=(512, 512), # H, W + ) + + url = "https://hf.co/datasets/torchgeo/l8biome/resolve/f76df19accce34d2acc1878d88b9491bc81f94c8/{}.tar.gz" + + md5s: ClassVar[dict[str, str]] = { + "barren": "0eb691822d03dabd4f5ea8aadd0b41c3", + "forest": "4a5645596f6bb8cea44677f746ec676e", + "grass_crops": "a69ed5d6cb227c5783f026b9303cdd3c", + "shrubland": "19df1d0a604faf6aab46d6a7a5e6da6a", + "snow_ice": "af8b189996cf3f578e40ee12e1f8d0c9", + "urban": "5450195ed95ee225934b9827bea1e8b0", + "water": "a81153415eb662c9e6812c2a8e38c743", + "wetlands": "1f86cc354631ca9a50ce54b7cab3f557", + } + + def __init__( + self, + root: Path | Iterable[Path] = "data/l8_biome", + bands: Sequence[str] = ("B4", "B3", "B2"), + all_transform=None, + img_transform=None, + ann_transform=None, + crs: CRS | None = CRS.from_epsg(3857), + res: float | None = None, + cache: bool = True, + download: bool = False, + checksum: bool = False, + ) -> None: + """Initialize a new L8Biome instance. + + Args: + paths: one or more root directories to search or files to load + crs: :term:`coordinate reference system (CRS)` to warp to + (defaults to EPSG:3857) + res: resolution of the dataset in units of CRS + (defaults to the resolution of the first file found) + bands: bands to return (defaults to all bands) + transforms: a function/transform that takes an input sample + and returns a transformed version + cache: if True, cache file handle to speed up repeated sampling + download: if True, download dataset and store it in the root directory + checksum: if True, check the MD5 of the downloaded files (may be slow) + + Raises: + DatasetNotFoundError: If dataset is not found and *download* is False. 
+ """ + self.paths = root + self.download = download + self.checksum = checksum + + self._verify() + self.all_transform = all_transform + self.img_transform = img_transform + self.ann_transform = ann_transform + self.image = L8BiomeImage( + paths=root, + crs=crs, + res=res, + bands=bands, + all_transform=self.all_transform, + img_transform=self.img_transform, + ann_transform=self.ann_transform, + cache=cache + ) + self.mask = L8BiomeMask( + paths=root, + crs=crs, + res=res, + bands=None, + all_transform=self.all_transform, + img_transform=self.img_transform, + ann_transform=self.ann_transform, + cache=cache + ) + + super().__init__(self.image, self.mask) + + def _verify(self) -> None: + """Verify the integrity of the dataset.""" + # Check if the extracted files already exist + if not isinstance(self.paths, str | pathlib.Path): + return + + for classname in [L8BiomeImage, L8BiomeMask]: + pathname = os.path.join(self.paths, "**", classname.filename_glob) + if not glob.glob(pathname, recursive=True): + break + else: + return + + # Check if the tar.gz files have already been downloaded + pathname = os.path.join(self.paths, "*.tar.gz") + if glob.glob(pathname): + self._extract() + return + + # Check if the user requested to download the dataset + if not self.download: + raise DatasetNotFoundError(self) + + # Download the dataset + self._download() + self._extract() + + def _download(self) -> None: + """Download the dataset.""" + for biome, md5 in self.md5s.items(): + download_url( + self.url.format(biome), self.paths, md5=md5 if self.checksum else None + ) + + def _extract(self) -> None: + """Extract the dataset.""" + assert isinstance(self.paths, str | pathlib.Path) + pathname = os.path.join(self.paths, "*.tar.gz") + for tarfile in glob.iglob(pathname): + extract_archive(tarfile) + + def plot( + self, + sample: dict[str, Tensor], + show_titles: bool = True, + suptitle: str | None = None, + ) -> Figure: + """Plot a sample from the dataset. + + Args: + sample: a sample returned by :meth:`RasterDataset.__getitem__` + show_titles: flag indicating whether to show titles above each panel + suptitle: optional string to use as a suptitle + + Returns: + a matplotlib Figure with the rendered sample + + Raises: + RGBBandsMissingError: If *bands* does not include all RGB bands. 
+ """ + rgb_indices = [] + for band in self.image.rgb_bands: + if band in self.image.bands: + rgb_indices.append(self.image.bands.index(band)) + else: + raise RGBBandsMissingError() + + image = sample["image"][rgb_indices].permute(1, 2, 0) # CxHxW -> HxWxC + + # Stretch to the full range + image = (image - image.min()) / (image.max() - image.min()) + + mask = sample["mask"].numpy().astype("uint8").squeeze() + + num_panels = 2 + showing_predictions = "prediction" in sample + if showing_predictions: + predictions = sample["prediction"].numpy().astype("uint8").squeeze() + num_panels += 1 + + kwargs = {"cmap": "gray", "vmin": 0, "vmax": 4, "interpolation": "none"} + fig, axs = plt.subplots(1, num_panels, figsize=(num_panels * 4, 5)) + axs[0].imshow(image) + axs[0].axis("off") + axs[1].imshow(mask, **kwargs) + axs[1].axis("off") + if show_titles: + axs[0].set_title("Image") + axs[1].set_title("Mask") + + if showing_predictions: + axs[2].imshow(predictions, **kwargs) + axs[2].axis("off") + if show_titles: + axs[2].set_title("Predictions") + + if suptitle is not None: + plt.suptitle(suptitle) + + return fig + + +if __name__ == "__main__": + L8Biome() diff --git a/dataset/l8_biome_crop.py b/dataset/l8_biome_crop.py new file mode 100644 index 0000000..4b58dee --- /dev/null +++ b/dataset/l8_biome_crop.py @@ -0,0 +1,163 @@ +import os +import albumentations +import albumentations.pytorch +import numpy as np +from PIL import Image +from torch.utils.data import Dataset +from typing import List +import tifffile as tf + + +class L8BiomeCrop(Dataset): + METAINFO = dict( + classes = ("Clear", "Cloud Shadow", "Thin Cloud", "Cloud"), + palette=( + (0, 0, 0), + (85, 85, 85), + (170, 170, 170), + (255, 255, 255), + ), + img_size=(512, 512), + ann_size=(512, 512), + ) + + def __init__( + self, + root: str = "data/l8_biome_crop", + bands: List[str] = ["B4", "B3", "B2"], + phase: str = "train", + all_transform: albumentations.Compose = None, + img_transform: albumentations.Compose = None, + ann_transform: albumentations.Compose = None + ): + self.root = root + self.bands = bands + self.phase = phase + self.all_transform = all_transform + self.img_transform = img_transform + self.ann_transform = ann_transform + + self.data = self.load_data() + + + + def load_data(self): + data = [] + maps = { + "barren": "Barren", + "forest": "Forest", + "grass": "Grass/Crops", + "shrubland": "Shrubland", + "snow":"Snow/Ice", + "urban": "Urban", + "water": "Water", + "wetlands":"Wetlands", + } + split_file = os.path.join(self.root, f'{self.phase}.txt') + with open(split_file, 'r') as f: + for line in f: + image_file = line.strip() + img_path = os.path.join(self.root, 'img_dir', image_file) + ann_path = os.path.join(self.root, 'ann_dir', image_file) + lac_type = image_file.split('_')[0] + lac_type = maps[lac_type] + data.append((img_path, ann_path, lac_type)) + return data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + image_path, mask_path, lac_type = self.data[idx] + + image = tf.imread(image_path) # H, W, C + + if len(self.bands)>11: + raise ValueError("The number of bands must be less than 11") + else: + tmp = np.zeros((image.shape[0], image.shape[1], len(self.bands)), dtype=np.float32) + for i, band in enumerate(self.bands): + if band == "B1": + tmp[:,:,i] = image[:,:,0] + elif band == "B2": + tmp[:,:,i] = image[:,:,1] + elif band == "B3": + tmp[:,:,i] = image[:,:,2] + elif band == "B4": + tmp[:,:,i] = image[:,:,3] + elif band == "B5": + tmp[:,:,i] = image[:,:,4] + elif band == "B6": + 
tmp[:,:,i] = image[:,:,5] + elif band == "B7": + tmp[:,:,i] = image[:,:,6] + elif band == "B8": + tmp[:,:,i] = image[:,:,7] + elif band == "B9": + tmp[:,:,i] = image[:,:,8] + elif band == "B10": + tmp[:,:,i] = image[:,:,9] + elif band == "B11": + tmp[:,:,i] = image[:,:,10] + else: + raise ValueError("The band must be one of 'B1','B2','B3','B4', 'B5','B6', 'B7', 'B8', 'B9', 'B10', 'B11', but got {}".format(band)) + image = tmp + + image = (image - image.min()) / (image.max() - image.min() + 1e-6) # normalize + mask = tf.imread(mask_path) # (H, W) + + if self.all_transform: + transformed = self.all_transform(image=image, mask=mask) + image = transformed["image"] + mask = transformed["mask"] + if self.img_transform: + image = self.img_transform(image=image)["image"] + if self.ann_transform: + mask = self.ann_transform(image=mask)["image"] + + return { + 'img': image, + 'ann': np.int64(mask), + 'img_path': image_path, + 'ann_path': mask_path, + 'lac_type': lac_type, + } + + +def show_l8_biome_crop(): + all_transform = albumentations.Compose([ + albumentations.OneOf([ + albumentations.HorizontalFlip(p=0.5), + albumentations.VerticalFlip(p=0.5), + albumentations.RandomRotate90(p=0.5), + albumentations.Transpose(p=0.5), + ], p=1), + ]) + + img_transform = albumentations.Compose([ + albumentations.pytorch.ToTensorV2(), + ]) + + dataset = L8BiomeCrop( + all_transform=all_transform, + img_transform=img_transform, + ) + + import matplotlib.pyplot as plt + + fig, axs = plt.subplots(1, 2, figsize=(10, 5)) + for sample in dataset: + print(sample['img'].shape, sample['ann'].shape) + axs[0].imshow(sample['img'].permute(1, 2, 0)*2.5) + axs[0].set_title('Image') + color_map = np.array(dataset.METAINFO['palette']) + color_ann = color_map[sample['ann']] + axs[1].imshow(color_ann) + axs[1].set_title('Annotation') + plt.suptitle(f'Land Cover Type: {sample["lac_type"]}') + plt.savefig('l8_biome_crop.png', bbox_inches="tight") + # break + + +if __name__ == '__main__': + show_l8_biome_crop() diff --git a/eval_result/gf1/config.py b/eval_result/gf1/config.py new file mode 100644 index 0000000..dd7c41b --- /dev/null +++ b/eval_result/gf1/config.py @@ -0,0 +1,367 @@ +crop_size = ( + 256, + 256, +) +data_root = 'data/gf12ms_whu_gf1' +dataset_type = 'GF12MSWHUGF1Dataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=3, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/gf1_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + 
type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=256, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_256x256.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 256, + 256, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=2, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 2 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) 
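+# NOTE (assumes mmengine's standard PolyLR behaviour): the scheduler below decays the
+# learning rate from 1e-4 towards eta_min=0 over its first 40k iterations and then holds
+# it there, while train_cfg.max_iters in this GF-1 config is 80k, so the later iterations
+# of a training run launched from this config would effectively use a near-zero learning
+# rate. This only affects training; this file is the dumped config of a test-only run.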
+param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf1', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF1Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=80000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/gf12ms_whu_gf1', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF1Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf1', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF1Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1_test' diff --git a/eval_result/gf1/result.json b/eval_result/gf1/result.json new file mode 100644 index 0000000..441cd14 --- /dev/null +++ b/eval_result/gf1/result.json @@ -0,0 +1,11 @@ +{ + "aAcc": 98.92, + "mIoU": 92.55, + "mAcc": 95.99, + "mDice": 96.02, + "mFscore": 96.02, + "mPrecision": 96.05, + "mRecall": 95.99, + "data_time": 0.003156662920449811, + "time": 0.09255845136138557 +} \ No newline at end of file diff --git a/eval_result/gf1/test.log b/eval_result/gf1/test.log new file mode 100644 index 0000000..d0b7040 --- /dev/null +++ b/eval_result/gf1/test.log @@ -0,0 +1,508 @@ +2024/11/22 14:36:11 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.8.20 (default, Oct 3 2024, 15:24:27) [GCC 11.2.0] + 
CUDA available: True + MUSA available: False + numpy_random_seed: 42 + GPU 0: NVIDIA GeForce RTX 3090 + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.3, V12.3.107 + GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + PyTorch: 2.0.1 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e) + - OpenMP 201511 (a.k.a. OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.8 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.7 + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, + + TorchVision: 0.15.2 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + cudnn_benchmark: True + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: 42 + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/11/22 14:36:11 - mmengine - INFO - Config: +crop_size = ( + 256, + 256, +) +data_root = 'data/gf12ms_whu_gf1' +dataset_type = 'GF12MSWHUGF1Dataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=3, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + 
dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/gf1_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=256, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_256x256.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 256, + 256, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=2, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, 
+ num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 2 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf1', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF1Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=80000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/gf12ms_whu_gf1', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF1Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf1', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF1Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf1_test' + +2024/11/22 14:36:14 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm 
(SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used. +2024/11/22 14:36:14 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/11/22 14:36:15 - mmengine - WARNING - The prefix is not set in metric class IoUMetric. 
+2024/11/22 14:36:17 - mmengine - INFO - Load checkpoint from checkpoints/cloud-adapter/gf1_full_weight.bin +2024/11/22 14:37:52 - mmengine - INFO - per class results: +2024/11/22 14:37:52 - mmengine - INFO - ++-----------+-------+-------+-------+--------+-----------+--------+ +| Class | IoU | Acc | Dice | Fscore | Precision | Recall | ++-----------+-------+-------+-------+--------+-----------+--------+ +| clear sky | 98.84 | 99.42 | 99.41 | 99.41 | 99.41 | 99.42 | +| cloud | 86.27 | 92.56 | 92.63 | 92.63 | 92.7 | 92.56 | ++-----------+-------+-------+-------+--------+-----------+--------+ +2024/11/22 14:37:52 - mmengine - INFO - Iter(test) [1022/1022] aAcc: 98.9200 mIoU: 92.5500 mAcc: 95.9900 mDice: 96.0200 mFscore: 96.0200 mPrecision: 96.0500 mRecall: 95.9900 data_time: 0.0032 time: 0.0926 diff --git a/eval_result/gf2/config.py b/eval_result/gf2/config.py new file mode 100644 index 0000000..df6214a --- /dev/null +++ b/eval_result/gf2/config.py @@ -0,0 +1,367 @@ +crop_size = ( + 256, + 256, +) +data_root = 'data/gf12ms_whu_gf2' +dataset_type = 'GF12MSWHUGF2Dataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/gf2_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=256, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_256x256.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 256, + 256, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=2, + num_queries=100, + 
num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 2 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf2', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF2Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/gf12ms_whu_gf2', + pipeline=[ + 
dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF2Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf2', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF2Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf2_test' diff --git a/eval_result/gf2/result.json b/eval_result/gf2/result.json new file mode 100644 index 0000000..13d55d8 --- /dev/null +++ b/eval_result/gf2/result.json @@ -0,0 +1 @@ +{"aAcc": 94.04, "mIoU": 83.02, "mAcc": 87.54, "mDice": 90.4, "mFscore": 90.4, "mPrecision": 94.33, "mRecall": 87.54, "data_time": 0.003199114370598364, "time": 0.09292630770849804} \ No newline at end of file diff --git a/eval_result/gf2/test.log b/eval_result/gf2/test.log new file mode 100644 index 0000000..0797edc --- /dev/null +++ b/eval_result/gf2/test.log @@ -0,0 +1,508 @@ +2024/11/22 14:37:11 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.8.20 (default, Oct 3 2024, 15:24:27) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 42 + GPU 0: NVIDIA GeForce RTX 3090 + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.3, V12.3.107 + GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + PyTorch: 2.0.1 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e) + - OpenMP 201511 (a.k.a. 
OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.8 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.7 + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, + + TorchVision: 0.15.2 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + cudnn_benchmark: True + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: 42 + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/11/22 14:37:11 - mmengine - INFO - Config: +crop_size = ( + 256, + 256, +) +data_root = 'data/gf12ms_whu_gf2' +dataset_type = 'GF12MSWHUGF2Dataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/gf2_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + 
int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=256, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_256x256.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 256, + 256, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=2, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 2 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 
'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf2', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF2Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/gf12ms_whu_gf2', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF2Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + data_root='data/gf12ms_whu_gf2', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='GF12MSWHUGF2Dataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_gf2_test' + +2024/11/22 14:37:14 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used. 
+2024/11/22 14:37:14 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/11/22 14:37:14 - mmengine - WARNING - The prefix is not set in metric class IoUMetric. 
+2024/11/22 14:37:17 - mmengine - INFO - Load checkpoint from checkpoints/cloud-adapter/gf2_full_weight.bin +2024/11/22 14:40:13 - mmengine - INFO - per class results: +2024/11/22 14:40:13 - mmengine - INFO - ++-----------+-------+-------+-------+--------+-----------+--------+ +| Class | IoU | Acc | Dice | Fscore | Precision | Recall | ++-----------+-------+-------+-------+--------+-----------+--------+ +| clear sky | 92.88 | 98.86 | 96.31 | 96.31 | 93.88 | 98.86 | +| cloud | 73.15 | 76.22 | 84.49 | 84.49 | 94.79 | 76.22 | ++-----------+-------+-------+-------+--------+-----------+--------+ +2024/11/22 14:40:13 - mmengine - INFO - Iter(test) [1890/1890] aAcc: 94.0400 mIoU: 83.0200 mAcc: 87.5400 mDice: 90.4000 mFscore: 90.4000 mPrecision: 94.3300 mRecall: 87.5400 data_time: 0.0032 time: 0.0929 diff --git a/eval_result/hrc/config.py b/eval_result/hrc/config.py new file mode 100644 index 0000000..4f459bd --- /dev/null +++ b/eval_result/hrc/config.py @@ -0,0 +1,367 @@ +crop_size = ( + 256, + 256, +) +data_root = 'data/hrc_whu' +dataset_type = 'HRCWHUDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/hrc_whu_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=256, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_256x256.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 256, + 256, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=2, + num_queries=100, + 
num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 2 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/hrc_whu', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='HRCWHUDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/hrc_whu', + pipeline=[ + dict(type='LoadImageFromFile'), + 
dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='HRCWHUDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/hrc_whu', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='HRCWHUDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu_test' diff --git a/eval_result/hrc/result.json b/eval_result/hrc/result.json new file mode 100644 index 0000000..ea46a4b --- /dev/null +++ b/eval_result/hrc/result.json @@ -0,0 +1 @@ +{"aAcc": 94.5, "mIoU": 89.05, "mAcc": 93.74, "mDice": 94.19, "mFscore": 94.19, "mPrecision": 94.77, "mRecall": 93.74, "data_time": 0.014577388763427734, "time": 0.20791089534759521} \ No newline at end of file diff --git a/eval_result/hrc/test.log b/eval_result/hrc/test.log new file mode 100644 index 0000000..5c0f825 --- /dev/null +++ b/eval_result/hrc/test.log @@ -0,0 +1,508 @@ +2024/11/22 14:45:34 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.8.20 (default, Oct 3 2024, 15:24:27) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 42 + GPU 0: NVIDIA GeForce RTX 3090 + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.3, V12.3.107 + GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + PyTorch: 2.0.1 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e) + - OpenMP 201511 (a.k.a. 
OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.8 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.7 + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, + + TorchVision: 0.15.2 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + cudnn_benchmark: True + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: 42 + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/11/22 14:45:34 - mmengine - INFO - Config: +crop_size = ( + 256, + 256, +) +data_root = 'data/hrc_whu' +dataset_type = 'HRCWHUDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/hrc_whu_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + 
int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=256, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_256x256.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 256, + 256, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=2, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 2 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 
'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/hrc_whu', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='HRCWHUDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/hrc_whu', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='HRCWHUDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 256, + 256, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/hrc_whu', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 256, + 256, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='HRCWHUDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu_test' + +2024/11/22 14:45:37 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used. 
+2024/11/22 14:45:37 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/11/22 14:45:38 - mmengine - WARNING - The prefix is not set in metric class IoUMetric. 
+2024/11/22 14:45:40 - mmengine - INFO - Load checkpoint from checkpoints/cloud-adapter/hrc_whu_full_weight.bin +2024/11/22 14:45:42 - mmengine - INFO - per class results: +2024/11/22 14:45:42 - mmengine - INFO - ++-----------+-------+-------+-------+--------+-----------+--------+ +| Class | IoU | Acc | Dice | Fscore | Precision | Recall | ++-----------+-------+-------+-------+--------+-----------+--------+ +| clear sky | 91.44 | 97.42 | 95.53 | 95.53 | 93.71 | 97.42 | +| cloud | 86.65 | 90.05 | 92.85 | 92.85 | 95.83 | 90.05 | ++-----------+-------+-------+-------+--------+-----------+--------+ +2024/11/22 14:45:42 - mmengine - INFO - Iter(test) [8/8] aAcc: 94.5000 mIoU: 89.0500 mAcc: 93.7400 mDice: 94.1900 mFscore: 94.1900 mPrecision: 94.7700 mRecall: 93.7400 data_time: 0.0146 time: 0.2079 diff --git a/eval_result/l1c/config.py b/eval_result/l1c/config.py new file mode 100644 index 0000000..501eb45 --- /dev/null +++ b/eval_result/l1c/config.py @@ -0,0 +1,369 @@ +crop_size = ( + 512, + 512, +) +data_root = 'data/cloudsen12_high_l1c' +dataset_type = 'CLOUDSEN12HIGHL1CDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/l1c_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_512x512.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, 
+ num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 4 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + 
dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_test' diff --git a/eval_result/l1c/result.json b/eval_result/l1c/result.json new file mode 100644 index 0000000..4c39c7e --- /dev/null +++ b/eval_result/l1c/result.json @@ -0,0 +1,11 @@ +{ + "aAcc": 90.19, + "mIoU": 74.18, + "mAcc": 84.79, + "mDice": 84.46, + "mFscore": 84.46, + "mPrecision": 84.2, + "mRecall": 84.79, + "data_time": 0.005092643323491831, + "time": 0.32215978473913476 +} \ No newline at end of file diff --git a/eval_result/l1c/test.log b/eval_result/l1c/test.log new file mode 100644 index 0000000..14191ef --- /dev/null +++ b/eval_result/l1c/test.log @@ -0,0 +1,512 @@ +2024/11/22 14:28:46 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.8.20 (default, Oct 3 2024, 15:24:27) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 42 + GPU 0: NVIDIA GeForce RTX 3090 + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.3, V12.3.107 + GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + PyTorch: 2.0.1 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e) + - OpenMP 201511 (a.k.a. 
OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.8 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.7 + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, + + TorchVision: 0.15.2 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + cudnn_benchmark: True + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: 42 + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/11/22 14:28:46 - mmengine - INFO - Config: +crop_size = ( + 512, + 512, +) +data_root = 'data/cloudsen12_high_l1c' +dataset_type = 'CLOUDSEN12HIGHL1CDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/l1c_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + 
hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_512x512.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 4 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 
'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l1c', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL1CDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_test' + +2024/11/22 14:28:49 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used. 
+2024/11/22 14:28:49 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/11/22 14:28:50 - mmengine - WARNING - The prefix is not set in metric class IoUMetric. 
+2024/11/22 14:28:52 - mmengine - INFO - Load checkpoint from checkpoints/cloud-adapter/l1c_full_weight.bin +2024/11/22 14:30:11 - mmengine - INFO - per class results: +2024/11/22 14:30:11 - mmengine - INFO - ++--------------+-------+-------+-------+--------+-----------+--------+ +| Class | IoU | Acc | Dice | Fscore | Precision | Recall | ++--------------+-------+-------+-------+--------+-----------+--------+ +| clear | 89.19 | 94.08 | 94.29 | 94.29 | 94.5 | 94.08 | +| thick cloud | 85.46 | 91.72 | 92.16 | 92.16 | 92.61 | 91.72 | +| thin cloud | 56.15 | 75.23 | 71.91 | 71.91 | 68.88 | 75.23 | +| cloud shadow | 65.93 | 78.15 | 79.47 | 79.47 | 80.83 | 78.15 | ++--------------+-------+-------+-------+--------+-----------+--------+ +2024/11/22 14:30:11 - mmengine - INFO - Iter(test) [244/244] aAcc: 90.1900 mIoU: 74.1800 mAcc: 84.7900 mDice: 84.4600 mFscore: 84.4600 mPrecision: 84.2000 mRecall: 84.7900 data_time: 0.0051 time: 0.3222 diff --git a/eval_result/l2a/config.py b/eval_result/l2a/config.py new file mode 100644 index 0000000..0aa75de --- /dev/null +++ b/eval_result/l2a/config.py @@ -0,0 +1,369 @@ +crop_size = ( + 512, + 512, +) +data_root = 'data/cloudsen12_high_l2a' +dataset_type = 'CLOUDSEN12HIGHL2ADataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/l2a_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_512x512.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + 
loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 4 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l2a', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL2ADataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + 
dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/cloudsen12_high_l2a', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL2ADataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l2a', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL2ADataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l2a_test' diff --git a/eval_result/l2a/result.json b/eval_result/l2a/result.json new file mode 100644 index 0000000..0383824 --- /dev/null +++ b/eval_result/l2a/result.json @@ -0,0 +1 @@ +{"aAcc": 89.9, "mIoU": 73.38, "mAcc": 83.93, "mDice": 83.87, "mFscore": 83.87, "mPrecision": 83.89, "mRecall": 83.93, "data_time": 0.004904283851873679, "time": 0.32561323388678126} \ No newline at end of file diff --git a/eval_result/l2a/test.log b/eval_result/l2a/test.log new file mode 100644 index 0000000..39890ac --- /dev/null +++ b/eval_result/l2a/test.log @@ -0,0 +1,512 @@ +2024/11/22 14:26:58 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.8.20 (default, Oct 3 2024, 15:24:27) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 42 + GPU 0: NVIDIA GeForce RTX 3090 + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.3, V12.3.107 + GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + PyTorch: 2.0.1 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e) + - OpenMP 201511 (a.k.a. 
OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.8 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.7 + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, + + TorchVision: 0.15.2 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + cudnn_benchmark: True + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: 42 + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/11/22 14:26:58 - mmengine - INFO - Config: +crop_size = ( + 512, + 512, +) +data_root = 'data/cloudsen12_high_l2a' +dataset_type = 'CLOUDSEN12HIGHL2ADataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'checkpoints/cloud-adapter/l2a_full_weight.bin' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + 
hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_512x512.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 4 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 
'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l2a', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL2ADataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/cloudsen12_high_l2a', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL2ADataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/cloudsen12_high_l2a', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='CLOUDSEN12HIGHL2ADataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l2a_test' + +2024/11/22 14:27:01 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used. 
+2024/11/22 14:27:01 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/11/22 14:27:02 - mmengine - WARNING - The prefix is not set in metric class IoUMetric. 
+2024/11/22 14:27:04 - mmengine - INFO - Load checkpoint from checkpoints/cloud-adapter/l2a_full_weight.bin +2024/11/22 14:28:24 - mmengine - INFO - per class results: +2024/11/22 14:28:24 - mmengine - INFO - ++--------------+-------+-------+-------+--------+-----------+--------+ +| Class | IoU | Acc | Dice | Fscore | Precision | Recall | ++--------------+-------+-------+-------+--------+-----------+--------+ +| clear | 89.04 | 94.65 | 94.2 | 94.2 | 93.75 | 94.65 | +| thick cloud | 84.91 | 90.61 | 91.84 | 91.84 | 93.1 | 90.61 | +| thin cloud | 54.97 | 73.64 | 70.95 | 70.95 | 68.44 | 73.64 | +| cloud shadow | 64.6 | 76.81 | 78.49 | 78.49 | 80.25 | 76.81 | ++--------------+-------+-------+-------+--------+-----------+--------+ +2024/11/22 14:28:24 - mmengine - INFO - Iter(test) [244/244] aAcc: 89.9000 mIoU: 73.3800 mAcc: 83.9300 mDice: 83.8700 mFscore: 83.8700 mPrecision: 83.8900 mRecall: 83.9300 data_time: 0.0049 time: 0.3256 diff --git a/eval_result/l8b/config.py b/eval_result/l8b/config.py new file mode 100644 index 0000000..5a8c5fa --- /dev/null +++ b/eval_result/l8b/config.py @@ -0,0 +1,370 @@ +crop_size = ( + 512, + 512, +) +data_root = 'data/l8_biome' +dataset_type = 'L8BIOMEDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'work_dirs/ours_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w/l8_full_weight.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_512x512.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + 
use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 4 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/l8_biome', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='L8BIOMEDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict( + max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + 
dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/l8_biome', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='L8BIOMEDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/l8_biome', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='L8BIOMEDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w_test' diff --git a/eval_result/l8b/l8_scene.json b/eval_result/l8b/l8_scene.json new file mode 100644 index 0000000..763dab1 --- /dev/null +++ b/eval_result/l8b/l8_scene.json @@ -0,0 +1,50 @@ +{ + "Grass/Crops": { + "mIoU": 0.41599829157725726, + "aAcc": 0.7271153227297845, + "mAcc": 0.6309999161890324, + "mDice": 0.48122983451729484 + }, + "Urban": { + "mIoU": 0.517627658617676, + "aAcc": 0.8626746668960109, + "mAcc": 0.768392929150688, + "mDice": 0.5836001147573872 + }, + "Wetlands": { + "mIoU": 0.501559420240926, + "aAcc": 0.8608137235174261, + "mAcc": 0.742586251323189, + "mDice": 0.5708926765107641 + }, + "Snow/Ice": { + "mIoU": 0.32010792407024374, + "aAcc": 0.7314604813803728, + "mAcc": 0.5698867728215837, + "mDice": 0.3624512298242071 + }, + "Barren": { + "mIoU": 0.4895024041433955, + "aAcc": 0.8348682487272544, + "mAcc": 0.723505101972713, + "mDice": 0.564495123817441 + }, + "Forest": { + "mIoU": 0.4809264687575488, + "aAcc": 0.8229694317169584, + "mAcc": 0.6876376428558933, + "mDice": 0.5625475973621544 + }, + "Shrubland": { + "mIoU": 0.5117673534353582, + "aAcc": 0.874197554744147, + "mAcc": 0.7201497797290773, + "mDice": 0.5919133188034655 + }, + "Water": { + "mIoU": 0.4643129164236885, + "aAcc": 0.8205382993715835, + "mAcc": 0.6775471505540217, + "mDice": 0.5326135977864226 + } +} \ No newline at end of file diff --git a/eval_result/l8b/result.json b/eval_result/l8b/result.json new file mode 100644 index 0000000..66d6eb1 --- /dev/null +++ b/eval_result/l8b/result.json @@ -0,0 +1 @@ +{"aAcc": 81.79, "mIoU": 57.53, "mAcc": 69.34, "mDice": 70.46, "mFscore": 70.46, "mPrecision": 77.43, "mRecall": 69.34, "data_time": 0.004421413756354674, "time": 0.32063033829658727} \ No newline at end of file diff --git a/eval_result/l8b/test.log b/eval_result/l8b/test.log new file mode 100644 index 0000000..37e4059 --- 
/dev/null +++ b/eval_result/l8b/test.log @@ -0,0 +1,513 @@ +2024/11/22 15:22:22 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.8.20 (default, Oct 3 2024, 15:24:27) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 42 + GPU 0: NVIDIA GeForce RTX 3090 + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.3, V12.3.107 + GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 + PyTorch: 2.0.1 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e) + - OpenMP 201511 (a.k.a. OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.8 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.7 + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.8, CUDNN_VERSION=8.7.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=ON, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, + + TorchVision: 0.15.2 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + cudnn_benchmark: True + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: 42 + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/11/22 15:22:22 - mmengine - INFO - Config: +crop_size = ( + 512, + 512, +) +data_root = 'data/l8_biome' +dataset_type = 'L8BIOMEDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=4000, + max_keep_ckpts=1, + rule='greater', + save_best=[ + 'mIoU', + ], + type='CheckpointHook'), + logger=dict(interval=4000, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + 
sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +launcher = 'none' +load_from = 'work_dirs/ours_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w/l8_full_weight.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_cfg=dict( + checkpoint='checkpoints/dinov2_converted_512x512.pth', + type='Pretrained'), + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, 
+ num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 4 +optim_wrapper = dict( + constructor='PEFTOptimWrapperConstructor', + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=0.0001, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'learnable_tokens': dict(decay_mult=0.0, lr_mult=1.0), + 'level_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'norm': dict(decay_mult=0.0), + 'query_embed': dict(decay_mult=0.0, lr_mult=1.0), + 'reins.scale': dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0)) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0, + power=0.9, + type='PolyLR'), +] +randomness = dict(seed=42) +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/l8_biome', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='L8BIOMEDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict( + max_iters=400000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + data_root='data/l8_biome', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='L8BIOMEDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(crop_size=( + 512, + 512, + ), type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=4, + dataset=dict( + data_prefix=dict(img_path='img_dir/test', seg_map_path='ann_dir/test'), + data_root='data/l8_biome', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(scale=( + 512, + 512, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='L8BIOMEDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + 'mDice', + 'mFscore', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + name='visualizer', + 
type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + ]) +work_dir = './work_dirs/cloud_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w_test' + +2024/11/22 15:22:25 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used. +2024/11/22 15:22:25 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(NORMAL ) SegVisualizationHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/11/22 15:22:25 - mmengine - WARNING - The prefix is not set in metric class IoUMetric. 
+2024/11/22 15:22:26 - mmengine - INFO - Load checkpoint from work_dirs/ours_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w/l8_full_weight.pth +2024/11/22 15:25:59 - mmengine - INFO - per class results: +2024/11/22 15:25:59 - mmengine - INFO - ++--------------+-------+-------+-------+--------+-----------+--------+ +| Class | IoU | Acc | Dice | Fscore | Precision | Recall | ++--------------+-------+-------+-------+--------+-----------+--------+ +| Clear | 85.81 | 90.63 | 92.36 | 92.36 | 94.16 | 90.63 | +| Cloud Shadow | 29.13 | 32.65 | 45.12 | 45.12 | 73.03 | 32.65 | +| Thin Cloud | 44.41 | 78.76 | 61.51 | 61.51 | 50.46 | 78.76 | +| Cloud | 70.75 | 75.35 | 82.87 | 82.87 | 92.05 | 75.35 | ++--------------+-------+-------+-------+--------+-----------+--------+ +2024/11/22 15:25:59 - mmengine - INFO - Iter(test) [661/661] aAcc: 81.7900 mIoU: 57.5300 mAcc: 69.3400 mDice: 70.4600 mFscore: 70.4600 mPrecision: 77.4300 mRecall: 69.3400 data_time: 0.0044 time: 0.3206 diff --git a/hugging_face/app.py b/hugging_face/app.py new file mode 100644 index 0000000..0d567b1 --- /dev/null +++ b/hugging_face/app.py @@ -0,0 +1,227 @@ +from mmseg.apis import init_model +from typing import List +from glob import glob +from cloud_adapter.cloud_adapter_dinov2 import CloudAdapterDinoVisionTransformer +import numpy as np +from PIL import Image +from mmseg.models.segmentors.encoder_decoder import EncoderDecoder +import gradio as gr +import torch +import os + + +class CloudAdapterGradio: + def __init__(self, config_path=None, checkpoint_path=None, device="cpu", example_inputs=None, num_classes=2, palette=None): + self.config_path = config_path + self.checkpoint_path = checkpoint_path + self.device = device + self.model: EncoderDecoder = init_model( + self.config_path, self.checkpoint_path, device=self.device) + self.model.eval() + self.example_inputs = example_inputs + self.img_size = 256 if num_classes == 2 else 512 + self.palette = palette + self.legend = self.html_legend(num_classes=num_classes) + + self.create_ui() + + def html_legend(self, num_classes=2): + if num_classes == 2: + return """ +
+<div style="display: flex; align-items: center; gap: 16px;">
+    <div style="display: flex; align-items: center; gap: 6px;">
+        <div style="width: 16px; height: 16px; background-color: rgb(79, 253, 199);"></div>
+        Clear
+    </div>
+    <div style="display: flex; align-items: center; gap: 6px;">
+        <div style="width: 16px; height: 16px; background-color: rgb(77, 2, 115);"></div>
+        Cloud
+    </div>
+</div>
+        """
+        return """
+<div style="display: flex; align-items: center; gap: 16px;">
+    <div style="display: flex; align-items: center; gap: 6px;">
+        <div style="width: 16px; height: 16px; background-color: rgb(79, 253, 199);"></div>
+        Clear Sky
+    </div>
+    <div style="display: flex; align-items: center; gap: 6px;">
+        <div style="width: 16px; height: 16px; background-color: rgb(77, 2, 115);"></div>
+        Thick Cloud
+    </div>
+    <div style="display: flex; align-items: center; gap: 6px;">
+        <div style="width: 16px; height: 16px; background-color: rgb(251, 255, 41);"></div>
+        Thin Cloud
+    </div>
+    <div style="display: flex; align-items: center; gap: 6px;">
+        <div style="width: 16px; height: 16px; background-color: rgb(221, 53, 223);"></div>
+        Cloud Shadow
+    </div>
+</div>
+""" + + def create_ui(self): + with gr.Row(): + # Left side: input image and buttons + with gr.Column(scale=1): # left column + in_image = gr.Image( + label='Input Image', + sources='upload', + elem_classes='input_image', + interactive=True, + type="pil", + ) + with gr.Row(): + run_button = gr.Button( + 'Run', + variant="primary", + ) + # list of example inputs + gr.Examples( + examples=self.example_inputs, + inputs=in_image, + label="Example Inputs" + ) + + # Right side: output image + with gr.Column(scale=1): # right column + with gr.Column(): + # output image + out_image = gr.Image( + label='Output Image', + elem_classes='output_image', + interactive=False + ) + # legend + legend = gr.HTML( + value=self.legend, + elem_classes="output_legend", + ) + + # Button click logic: trigger the image conversion + run_button.click( + self.inference, + inputs=in_image, + outputs=out_image, + ) + + @torch.no_grad() + def inference(self, image: Image.Image) -> Image.Image: + return self.cloud_adapter_forward(image) + + @torch.no_grad() + def cloud_adapter_forward(self, image: Image.Image) -> Image.Image: + """ + Cloud Adapter Inference + """ + ori_size = image.size + image = image.resize((self.img_size, self.img_size), + resample=Image.Resampling.BILINEAR) + image = np.array(image) + # print(image.shape) + image = (image - np.min(image)) / (np.max(image)-np.min(image)) + + image = torch.from_numpy(image).unsqueeze(0).to(self.device) + image = image.permute(0, 3, 1, 2).float() + + outs = self.model.predict(image) + pred_mask = outs[0].pred_sem_seg.data.cpu().numpy().astype(np.uint8) + + im = Image.fromarray(pred_mask[0]).convert("P") + im.putpalette(self.palette) + + del image + del outs + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return im.resize(ori_size, resample=Image.Resampling.BILINEAR) + + +def get_palette(dataset_name: str) -> List[int]: + if dataset_name in ["cloudsen12_high_l1c", "cloudsen12_high_l2a"]: + return [79, 253, 199, 77, 2, 115, 251, 255, 41, 221, 53, 223] + if dataset_name == "l8_biome": + return [79, 253, 199, 221, 53, 223, 251, 255, 41, 77, 2, 115] + if dataset_name in ["gf12ms_whu_gf1", "gf12ms_whu_gf2", "hrc_whu"]: + return [79, 253, 199, 77, 2, 115] + raise Exception("dataset_name not supported") + + +if __name__ == '__main__': + title = 'Cloud Segmentation for Remote Sensing Images' + custom_css = """ +h1 { + text-align: center; + font-size: 24px; + font-weight: bold; + margin-bottom: 20px; +} +""" + hrc_whu_examples = glob("example_inputs/hrc_whu/*") + gf1_examples = glob("example_inputs/gf1/*") + gf2_examples = glob("example_inputs/gf2/*") + l1c_examples = glob("example_inputs/l1c/*") + l2a_examples = glob("example_inputs/l2a/*") + l8_examples = glob("example_inputs/l8/*") + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + with gr.Blocks(analytics_enabled=False, title=title,css=custom_css) as demo: + gr.Markdown(f'# {title}') + with gr.Tabs(): + with gr.TabItem('Google Earth'): + CloudAdapterGradio( + config_path="cloud-adapter-configs/binary_classes_256x256.py", + checkpoint_path="checkpoints/cloud-adapter/hrc_whu_full_weight.pth", + device=device, + example_inputs=hrc_whu_examples, + num_classes=2, + palette=get_palette("hrc_whu"), + ) + with gr.TabItem('Gaofen-1'): + CloudAdapterGradio( + config_path="cloud-adapter-configs/binary_classes_256x256.py", + checkpoint_path="checkpoints/cloud-adapter/gf1_full_weight.pth", + device=device, + example_inputs=gf1_examples, + num_classes=2, + palette=get_palette("gf12ms_whu_gf1"), + ) + with gr.TabItem('Gaofen-2'): + CloudAdapterGradio( + config_path="cloud-adapter-configs/binary_classes_256x256.py", 
checkpoint_path="checkpoints/cloud-adapter/gf2_full_weight.pth", + device=device, + example_inputs=gf2_examples, + num_classes=2, + palette=get_palette("gf12ms_whu_gf2"), + ) + + with gr.TabItem('Sentinel-2 (L1C)'): + CloudAdapterGradio( + config_path="cloud-adapter-configs/multi_classes_512x512.py", + checkpoint_path="checkpoints/cloud-adapter/l1c_full_weight.pth", + device=device, + example_inputs=l1c_examples, + num_classes=4, + palette=get_palette("cloudsen12_high_l1c"), + ) + with gr.TabItem('Sentinel-2 (L2A)'): + CloudAdapterGradio( + config_path="cloud-adapter-configs/multi_classes_512x512.py", + checkpoint_path="checkpoints/cloud-adapter/l2a_full_weight.pth", + device=device, + example_inputs=l2a_examples, + num_classes=4, + palette=get_palette("cloudsen12_high_l2a"), + ) + with gr.TabItem('Landsat-8'): + CloudAdapterGradio( + config_path="cloud-adapter-configs/multi_classes_512x512.py", + checkpoint_path="checkpoints/cloud-adapter/l8_full_weight.pth", + device=device, + example_inputs=l8_examples, + num_classes=4, + palette=get_palette("l8_biome"), + ) + + demo.launch(share=True, debug=True) diff --git a/hugging_face/cloud-adapter-configs/binary_classes_256x256.py b/hugging_face/cloud-adapter-configs/binary_classes_256x256.py new file mode 100644 index 0000000..af42b64 --- /dev/null +++ b/hugging_face/cloud-adapter-configs/binary_classes_256x256.py @@ -0,0 +1,205 @@ +crop_size = ( + 256, + 256, +) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=2, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, 
type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') + + + diff --git a/hugging_face/cloud-adapter-configs/multi_classes_512x512.py b/hugging_face/cloud-adapter-configs/multi_classes_512x512.py new file mode 100644 index 0000000..aac53df --- /dev/null +++ b/hugging_face/cloud-adapter-configs/multi_classes_512x512.py @@ -0,0 +1,205 @@ +crop_size = ( + 512, + 512, +) +model = dict( + backbone=dict( + adapter_index=[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + ], + block_chunks=0, + depth=24, + embed_dim=1024, + ffn_bias=True, + ffn_layer='mlp', + has_cat=False, + img_size=512, + init_values=1e-05, + mlp_ratio=4, + num_heads=16, + cloud_adapter_config=dict( + cnn_type='pmaa', + context_dim=64, + depth=4, + emd_dim=1024, + global_groups=1, + hidden_channels=64, + int_type='convnext', + local_groups=1, + num_layers=24, + rank_dim=16, + return_last_feature=False, + return_multi_feats=False, + type='CloudAdapter'), + patch_size=16, + proj_bias=True, + qkv_bias=True, + type='CloudAdapterDinoVisionTransformer'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 512, + 512, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=256, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=4, + num_queries=100, + num_transformer_feat_level=3, + out_channels=256, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=256, + feedforward_channels=1024, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, 
+ embed_dims=256, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=8, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=128), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=128), + strides=[ + 4, + 8, + 16, + 32, + ], + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=256, + feedforward_channels=2048, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=256, + num_heads=8, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') + + + diff --git a/hugging_face/cloud_adapter/__init__.py b/hugging_face/cloud_adapter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hugging_face/cloud_adapter/cdnetv1.py b/hugging_face/cloud_adapter/cdnetv1.py new file mode 100644 index 0000000..0811514 --- /dev/null +++ b/hugging_face/cloud_adapter/cdnetv1.py @@ -0,0 +1,389 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/7/24 上午11:36 +# @Author : xiaoshun +# @Email : 3038523973@qq.com +# @File : cdnetv1.py +# @Software: PyCharm + +"""Cloud detection Network""" + +"""Cloud detection Network""" + +""" +This is the implementation of CDnetV1 without multi-scale inputs. This implementation uses ResNet by default. 
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +affine_par = True + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, affine=affine_par) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, affine=affine_par) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change + self.bn1 = nn.BatchNorm2d(planes, affine=affine_par) + for i in self.bn1.parameters(): + i.requires_grad = False + + padding = dilation + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change + padding=padding, bias=False, dilation=dilation) + self.bn2 = nn.BatchNorm2d(planes, affine=affine_par) + for i in self.bn2.parameters(): + i.requires_grad = False + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4, affine=affine_par) + for i in self.bn3.parameters(): + i.requires_grad = False + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Classifier_Module(nn.Module): + + def __init__(self, dilation_series, padding_series, num_classes): + super(Classifier_Module, self).__init__() + self.conv2d_list = nn.ModuleList() + for dilation, padding in zip(dilation_series, padding_series): + self.conv2d_list.append( + nn.Conv2d(2048, num_classes, kernel_size=3, stride=1, padding=padding, dilation=dilation, bias=True)) + + for m in self.conv2d_list: + m.weight.data.normal_(0, 0.01) + + def forward(self, x): + out = self.conv2d_list[0](x) + for i in range(len(self.conv2d_list) - 1): + out += self.conv2d_list[i + 1](x) + return out + + +class _ConvBNReLU(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, + dilation=1, groups=1, norm_layer=nn.BatchNorm2d): + super(_ConvBNReLU, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias=False) + self.bn = norm_layer(out_channels) + self.relu = nn.ReLU(True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class _ASPPConv(nn.Module): + def __init__(self, in_channels, out_channels, atrous_rate, norm_layer): + super(_ASPPConv, self).__init__() + self.block = nn.Sequential( + 
nn.Conv2d(in_channels, out_channels, 3, padding=atrous_rate, dilation=atrous_rate, bias=False), + norm_layer(out_channels), + nn.ReLU(True) + ) + + def forward(self, x): + return self.block(x) + + +class _AsppPooling(nn.Module): + def __init__(self, in_channels, out_channels, norm_layer): + super(_AsppPooling, self).__init__() + self.gap = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, out_channels, 1, bias=False), + norm_layer(out_channels), + nn.ReLU(True) + ) + + def forward(self, x): + size = x.size()[2:] + pool = self.gap(x) + out = F.interpolate(pool, size, mode='bilinear', align_corners=True) + return out + + +class _ASPP(nn.Module): + def __init__(self, in_channels, atrous_rates, norm_layer): + super(_ASPP, self).__init__() + out_channels = 512 # changed from 256 + self.b0 = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, bias=False), + norm_layer(out_channels), + nn.ReLU(True) + ) + + rate1, rate2, rate3 = tuple(atrous_rates) + self.b1 = _ASPPConv(in_channels, out_channels, rate1, norm_layer) + self.b2 = _ASPPConv(in_channels, out_channels, rate2, norm_layer) + self.b3 = _ASPPConv(in_channels, out_channels, rate3, norm_layer) + self.b4 = _AsppPooling(in_channels, out_channels, norm_layer=norm_layer) + + # self.project = nn.Sequential( + # nn.Conv2d(5 * out_channels, out_channels, 1, bias=False), + # norm_layer(out_channels), + # nn.ReLU(True), + # nn.Dropout(0.5)) + self.dropout2d = nn.Dropout2d(0.3) + + def forward(self, x): + feat1 = self.dropout2d(self.b0(x)) + feat2 = self.dropout2d(self.b1(x)) + feat3 = self.dropout2d(self.b2(x)) + feat4 = self.dropout2d(self.b3(x)) + feat5 = self.dropout2d(self.b4(x)) + x = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + # x = self.project(x) + return x + + +class _FPM(nn.Module): + def __init__(self, in_channels, num_classes, norm_layer=nn.BatchNorm2d): + super(_FPM, self).__init__() + self.aspp = _ASPP(in_channels, [6, 12, 18], norm_layer=norm_layer) + # self.dropout2d = nn.Dropout2d(0.5) + + def forward(self, x): + x = torch.cat((x, self.aspp(x)), dim=1) + # x = self.dropout2d(x) # added + return x + + +class BR(nn.Module): + def __init__(self, num_classes, stride=1, downsample=None): + super(BR, self).__init__() + self.conv1 = conv3x3(num_classes, num_classes * 16, stride) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(num_classes * 16, num_classes) + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + + out = self.conv2(out) + out += residual + + return out + + +class CDnetV1(nn.Module): + def __init__(self, in_channels=3,block=Bottleneck, layers=[3, 4, 6, 3], num_classes=21, aux=True): + self.inplanes = 64 + self.aux = aux + super().__init__() + # self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + # self.bn1 = nn.BatchNorm2d(64, affine = affine_par) + + self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64, affine=affine_par) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(64, affine=affine_par) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(64, affine=affine_par) + + for i in self.bn1.parameters(): + i.requires_grad = False + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True) # change + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 
= self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) + # self.layer5 = self._make_pred_layer(Classifier_Module, [6,12,18,24],[6,12,18,24],num_classes) + + self.res5_con1x1 = nn.Sequential( + nn.Conv2d(1024 + 2048, 512, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(512), + nn.ReLU(True) + ) + + self.fpm1 = _FPM(512, num_classes) + self.fpm2 = _FPM(512, num_classes) + self.fpm3 = _FPM(256, num_classes) + + self.br1 = BR(num_classes) + self.br2 = BR(num_classes) + self.br3 = BR(num_classes) + self.br4 = BR(num_classes) + self.br5 = BR(num_classes) + self.br6 = BR(num_classes) + self.br7 = BR(num_classes) + + self.predict1 = self._predict_layer(512 * 6, num_classes) + self.predict2 = self._predict_layer(512 * 6, num_classes) + self.predict3 = self._predict_layer(512 * 5 + 256, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, 0.01) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + # for i in m.parameters(): + # i.requires_grad = False + + def _predict_layer(self, in_channels, num_classes): + return nn.Sequential(nn.Conv2d(in_channels, 256, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(256), + nn.ReLU(True), + nn.Dropout2d(0.1), + nn.Conv2d(256, num_classes, kernel_size=3, stride=1, padding=1, bias=True)) + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion or dilation == 2 or dilation == 4: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, affine=affine_par)) + for i in downsample._modules['1'].parameters(): + i.requires_grad = False + layers = [] + layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, dilation=dilation)) + + return nn.Sequential(*layers) + + # def _make_pred_layer(self,block, dilation_series, padding_series,num_classes): + # return block(dilation_series,padding_series,num_classes) + + def base_forward(self, x): + x = self.relu(self.bn1(self.conv1(x))) + size_conv1 = x.size()[2:] + x = self.relu(self.bn2(self.conv2(x))) + x = self.relu(self.bn3(self.conv3(x))) + x = self.maxpool(x) + x = self.layer1(x) + res2 = x + x = self.layer2(x) + res3 = x + x = self.layer3(x) + res4 = x + x = self.layer4(x) + x = self.res5_con1x1(torch.cat([x, res4], dim=1)) + + return x, res3, res2, size_conv1 + + def forward(self, x): + size = x.size()[2:] + score1, score2, score3, size_conv1 = self.base_forward(x) + # outputs = list() + score1 = self.fpm1(score1) + score1 = self.predict1(score1) # 1/8 + predict1 = score1 + score1 = self.br1(score1) + + score2 = self.fpm2(score2) + score2 = self.predict2(score2) # 1/8 + predict2 = score2 + + # first fusion + score2 = self.br2(score2) + score1 + score2 = self.br3(score2) + + score3 = self.fpm3(score3) + score3 = self.predict3(score3) # 1/4 + predict3 = score3 + score3 = self.br4(score3) + + # second fusion + size_score3 = score3.size()[2:] + score3 = score3 + F.interpolate(score2, size_score3, mode='bilinear', align_corners=True) + score3 = self.br5(score3) + + # 
upsampling + BR + score3 = F.interpolate(score3, size_conv1, mode='bilinear', align_corners=True) + score3 = self.br6(score3) + score3 = F.interpolate(score3, size, mode='bilinear', align_corners=True) + score3 = self.br7(score3) + + # if self.aux: + # auxout = self.dsn(mid) + # auxout = F.interpolate(auxout, size, mode='bilinear', align_corners=True) + # #outputs.append(auxout) + return score3 + # return score3, predict1, predict2, predict3 + + +if __name__ == '__main__': + model = CDnetV1(num_classes=21) + fake_image = torch.randn(2, 3, 224, 224) + outputs = model(fake_image) + for out in outputs: + print(out.shape) + # torch.Size([2, 21, 224, 224]) + # torch.Size([2, 21, 29, 29]) + # torch.Size([2, 21, 29, 29]) + # torch.Size([2, 21, 57, 57]) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/cdnetv2.py b/hugging_face/cloud_adapter/cdnetv2.py new file mode 100644 index 0000000..e6fbdee --- /dev/null +++ b/hugging_face/cloud_adapter/cdnetv2.py @@ -0,0 +1,693 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/7/24 下午3:41 +# @Author : xiaoshun +# @Email : 3038523973@qq.com +# @File : cdnetv2.py +# @Software: PyCharm + +"""Cloud detection Network""" + +""" +This is the implementation of CDnetV2 without multi-scale inputs. This implementation uses ResNet by default. +""" +# nn.GroupNorm + +import torch +# import torch.nn as nn +import torch.nn.functional as F +from torch import nn + +affine_par = True + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, affine=affine_par) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, affine=affine_par) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change + self.bn1 = nn.BatchNorm2d(planes, affine=affine_par) + for i in self.bn1.parameters(): + i.requires_grad = False + + padding = dilation + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change + padding=padding, bias=False, dilation=dilation) + self.bn2 = nn.BatchNorm2d(planes, affine=affine_par) + for i in self.bn2.parameters(): + i.requires_grad = False + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4, affine=affine_par) + for i in self.bn3.parameters(): + i.requires_grad = False + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = 
self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + # self.layerx_1 = Bottleneck_nosample(64, 64, stride=1, dilation=1) + # self.layerx_2 = Bottleneck(256, 64, stride=1, dilation=1, downsample=None) + # self.layerx_3 = Bottleneck_downsample(256, 64, stride=2, dilation=1) + + +class Res_block_1(nn.Module): + expansion = 4 + + def __init__(self, inplanes=64, planes=64, stride=1, dilation=1): + super(Res_block_1, self).__init__() + + self.conv1 = nn.Sequential( + nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, bias=False), + nn.GroupNorm(8, planes), + nn.ReLU(inplace=True)) + + self.conv2 = nn.Sequential( + nn.Conv2d(planes, planes, kernel_size=3, stride=1, + padding=1, bias=False, dilation=1), + nn.GroupNorm(8, planes), + nn.ReLU(inplace=True)) + + self.conv3 = nn.Sequential( + nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False), + nn.GroupNorm(8, planes * 4)) + + self.relu = nn.ReLU(inplace=True) + + self.down_sample = nn.Sequential( + nn.Conv2d(inplanes, planes * 4, + kernel_size=1, stride=1, bias=False), + nn.GroupNorm(8, planes * 4)) + + def forward(self, x): + # residual = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + residual = self.down_sample(x) + out += residual + out = self.relu(out) + + return out + + +class Res_block_2(nn.Module): + expansion = 4 + + def __init__(self, inplanes=256, planes=64, stride=1, dilation=1): + super(Res_block_2, self).__init__() + + self.conv1 = nn.Sequential( + nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, bias=False), + nn.GroupNorm(8, planes), + nn.ReLU(inplace=True)) + + self.conv2 = nn.Sequential( + nn.Conv2d(planes, planes, kernel_size=3, stride=1, + padding=1, bias=False, dilation=1), + nn.GroupNorm(8, planes), + nn.ReLU(inplace=True)) + + self.conv3 = nn.Sequential( + nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False), + nn.GroupNorm(8, planes * 4)) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + out += residual + out = self.relu(out) + + return out + + +class Res_block_3(nn.Module): + expansion = 4 + + def __init__(self, inplanes=256, planes=64, stride=1, dilation=1): + super(Res_block_3, self).__init__() + + self.conv1 = nn.Sequential( + nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False), + nn.GroupNorm(8, planes), + nn.ReLU(inplace=True)) + + self.conv2 = nn.Sequential( + nn.Conv2d(planes, planes, kernel_size=3, stride=1, + padding=1, bias=False, dilation=1), + nn.GroupNorm(8, planes), + nn.ReLU(inplace=True)) + + self.conv3 = nn.Sequential( + nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False), + nn.GroupNorm(8, planes * 4)) + + self.relu = nn.ReLU(inplace=True) + + self.downsample = nn.Sequential( + nn.Conv2d(inplanes, planes * 4, + kernel_size=1, stride=stride, bias=False), + nn.GroupNorm(8, planes * 4)) + + def forward(self, x): + # residual = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + # residual = self.downsample(x) + out += self.downsample(x) + out = self.relu(out) + + return out + + +class Classifier_Module(nn.Module): + + def __init__(self, dilation_series, padding_series, num_classes): + super(Classifier_Module, self).__init__() + self.conv2d_list = nn.ModuleList() + for dilation, padding in zip(dilation_series, padding_series): + self.conv2d_list.append( + nn.Conv2d(2048, num_classes, kernel_size=3, stride=1, padding=padding, dilation=dilation, bias=True)) + + for m in self.conv2d_list: + 
m.weight.data.normal_(0, 0.01) + + def forward(self, x): + out = self.conv2d_list[0](x) + for i in range(len(self.conv2d_list) - 1): + out += self.conv2d_list[i + 1](x) + return out + + +class _ConvBNReLU(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, + dilation=1, groups=1, relu6=False, norm_layer=nn.BatchNorm2d): + super(_ConvBNReLU, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias=False) + self.bn = norm_layer(out_channels) + self.relu = nn.ReLU6(True) if relu6 else nn.ReLU(True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class _ASPPConv(nn.Module): + def __init__(self, in_channels, out_channels, atrous_rate, norm_layer): + super(_ASPPConv, self).__init__() + self.block = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 3, padding=atrous_rate, dilation=atrous_rate, bias=False), + norm_layer(out_channels), + nn.ReLU(True) + ) + + def forward(self, x): + return self.block(x) + + +class _AsppPooling(nn.Module): + def __init__(self, in_channels, out_channels, norm_layer): + super(_AsppPooling, self).__init__() + self.gap = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, out_channels, 1, bias=False), + norm_layer(out_channels), + nn.ReLU(True) + ) + + def forward(self, x): + size = x.size()[2:] + pool = self.gap(x) + out = F.interpolate(pool, size, mode='bilinear', align_corners=True) + return out + + +class _ASPP(nn.Module): + def __init__(self, in_channels, atrous_rates, norm_layer): + super(_ASPP, self).__init__() + out_channels = 256 + self.b0 = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, bias=False), + norm_layer(out_channels), + nn.ReLU(True) + ) + + rate1, rate2, rate3 = tuple(atrous_rates) + self.b1 = _ASPPConv(in_channels, out_channels, rate1, norm_layer) + self.b2 = _ASPPConv(in_channels, out_channels, rate2, norm_layer) + self.b3 = _ASPPConv(in_channels, out_channels, rate3, norm_layer) + self.b4 = _AsppPooling(in_channels, out_channels, norm_layer=norm_layer) + + self.project = nn.Sequential( + nn.Conv2d(5 * out_channels, out_channels, 1, bias=False), + norm_layer(out_channels), + nn.ReLU(True), + nn.Dropout(0.5) + ) + + def forward(self, x): + feat1 = self.b0(x) + feat2 = self.b1(x) + feat3 = self.b2(x) + feat4 = self.b3(x) + feat5 = self.b4(x) + x = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + x = self.project(x) + return x + + +class _DeepLabHead(nn.Module): + def __init__(self, num_classes, c1_channels=256, norm_layer=nn.BatchNorm2d): + super(_DeepLabHead, self).__init__() + self.aspp = _ASPP(2048, [12, 24, 36], norm_layer=norm_layer) + self.c1_block = _ConvBNReLU(c1_channels, 48, 3, padding=1, norm_layer=norm_layer) + self.block = nn.Sequential( + _ConvBNReLU(304, 256, 3, padding=1, norm_layer=norm_layer), + nn.Dropout(0.5), + _ConvBNReLU(256, 256, 3, padding=1, norm_layer=norm_layer), + nn.Dropout(0.1), + nn.Conv2d(256, num_classes, 1)) + + def forward(self, x, c1): + size = c1.size()[2:] + c1 = self.c1_block(c1) + x = self.aspp(x) + x = F.interpolate(x, size, mode='bilinear', align_corners=True) + return self.block(torch.cat([x, c1], dim=1)) + + +class _CARM(nn.Module): + def __init__(self, in_planes, ratio=8): + super(_CARM, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1_1 = nn.Linear(in_planes, in_planes // ratio) + self.fc1_2 = nn.Linear(in_planes // ratio, in_planes) + + self.fc2_1 = 
nn.Linear(in_planes, in_planes // ratio) + self.fc2_2 = nn.Linear(in_planes // ratio, in_planes) + self.relu = nn.ReLU(True) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + avg_out = self.avg_pool(x) + avg_out = avg_out.view(avg_out.size(0), -1) + avg_out = self.fc1_2(self.relu(self.fc1_1(avg_out))) + + max_out = self.max_pool(x) + max_out = max_out.view(max_out.size(0), -1) + max_out = self.fc2_2(self.relu(self.fc2_1(max_out))) + + max_out_size = max_out.size()[1] + avg_out = torch.reshape(avg_out, (-1, max_out_size, 1, 1)) + max_out = torch.reshape(max_out, (-1, max_out_size, 1, 1)) + + out = self.sigmoid(avg_out + max_out) + + x = out * x + return x + + +class FSFB_CH(nn.Module): + def __init__(self, in_planes, num, ratio=8): + super(FSFB_CH, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc1_1 = nn.Linear(in_planes, in_planes // ratio) + self.fc1_2 = nn.Linear(in_planes // ratio, num * in_planes) + + self.fc2_1 = nn.Linear(in_planes, in_planes // ratio) + self.fc2_2 = nn.Linear(in_planes // ratio, num * in_planes) + self.relu = nn.ReLU(True) + + self.fc3 = nn.Linear(num * in_planes, 2 * num * in_planes) + self.fc4 = nn.Linear(2 * num * in_planes, 2 * num * in_planes) + self.fc5 = nn.Linear(2 * num * in_planes, num * in_planes) + + self.softmax = nn.Softmax(dim=3) + + def forward(self, x, num): + avg_out = self.avg_pool(x) + avg_out = avg_out.view(avg_out.size(0), -1) + avg_out = self.fc1_2(self.relu(self.fc1_1(avg_out))) + + max_out = self.max_pool(x) + max_out = max_out.view(max_out.size(0), -1) + max_out = self.fc2_2(self.relu(self.fc2_1(max_out))) + + out = avg_out + max_out + out = self.relu(self.fc3(out)) + out = self.relu(self.fc4(out)) + out = self.relu(self.fc5(out)) # (N, num*in_planes) + + out_size = out.size()[1] + out = torch.reshape(out, (-1, out_size // num, 1, num)) # (N, in_planes, 1, num ) + out = self.softmax(out) + + channel_scale = torch.chunk(out, num, dim=3) # (N, in_planes, 1, 1 ) + + return channel_scale + + +class FSFB_SP(nn.Module): + def __init__(self, num, norm_layer=nn.BatchNorm2d): + super(FSFB_SP, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d(2, 2 * num, kernel_size=3, padding=1, bias=False), + norm_layer(2 * num), + nn.ReLU(True), + nn.Conv2d(2 * num, 4 * num, kernel_size=3, padding=1, bias=False), + norm_layer(4 * num), + nn.ReLU(True), + nn.Conv2d(4 * num, 4 * num, kernel_size=3, padding=1, bias=False), + norm_layer(4 * num), + nn.ReLU(True), + nn.Conv2d(4 * num, 2 * num, kernel_size=3, padding=1, bias=False), + norm_layer(2 * num), + nn.ReLU(True), + nn.Conv2d(2 * num, num, kernel_size=3, padding=1, bias=False) + ) + self.softmax = nn.Softmax(dim=1) + + def forward(self, x, num): + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out, _ = torch.max(x, dim=1, keepdim=True) + x = torch.cat([avg_out, max_out], dim=1) + x = self.conv(x) + x = self.softmax(x) + spatial_scale = torch.chunk(x, num, dim=1) + return spatial_scale + + +################################################################################################################## + + +class _HFFM(nn.Module): + def __init__(self, in_channels, atrous_rates, norm_layer=nn.BatchNorm2d): + super(_HFFM, self).__init__() + out_channels = 256 + self.b0 = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, bias=False), + norm_layer(out_channels), + nn.ReLU(True) + ) + + rate1, rate2, rate3 = tuple(atrous_rates) + self.b1 = _ASPPConv(in_channels, out_channels, rate1, norm_layer) + self.b2 = 
_ASPPConv(in_channels, out_channels, rate2, norm_layer) + self.b3 = _ASPPConv(in_channels, out_channels, rate3, norm_layer) + self.b4 = _AsppPooling(in_channels, out_channels, norm_layer=norm_layer) + self.carm = _CARM(in_channels) + self.sa = FSFB_SP(4, norm_layer) + self.ca = FSFB_CH(out_channels, 4, 8) + + def forward(self, x, num): + x = self.carm(x) + # feat1 = self.b0(x) + feat1 = self.b1(x) + feat2 = self.b2(x) + feat3 = self.b3(x) + feat4 = self.b4(x) + feat = feat1 + feat2 + feat3 + feat4 + spatial_atten = self.sa(feat, num) + channel_atten = self.ca(feat, num) + + feat_ca = channel_atten[0] * feat1 + channel_atten[1] * feat2 + channel_atten[2] * feat3 + channel_atten[ + 3] * feat4 + feat_sa = spatial_atten[0] * feat1 + spatial_atten[1] * feat2 + spatial_atten[2] * feat3 + spatial_atten[ + 3] * feat4 + feat_sa = feat_sa + feat_ca + + return feat_sa + + +class _AFFM(nn.Module): + def __init__(self, in_channels=256, norm_layer=nn.BatchNorm2d): + super(_AFFM, self).__init__() + + self.sa = FSFB_SP(2, norm_layer) + self.ca = FSFB_CH(in_channels, 2, 8) + self.carm = _CARM(in_channels) + + def forward(self, feat1, feat2, hffm, num): + feat = feat1 + feat2 + spatial_atten = self.sa(feat, num) + channel_atten = self.ca(feat, num) + + feat_ca = channel_atten[0] * feat1 + channel_atten[1] * feat2 + feat_sa = spatial_atten[0] * feat1 + spatial_atten[1] * feat2 + output = self.carm(feat_sa + feat_ca + hffm) + # output = self.carm (feat_sa + hffm) + + return output, channel_atten, spatial_atten + + +class block_Conv3x3(nn.Module): + def __init__(self, in_channels): + super(block_Conv3x3, self).__init__() + self.block = nn.Sequential( + nn.Conv2d(in_channels, 256, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(True) + ) + + def forward(self, x): + return self.block(x) + + +class CDnetV2(nn.Module): + def __init__(self, in_channels=3,block=Bottleneck, layers=[3, 4, 6, 3], num_classes=21, aux=True): + self.inplanes = 256 # change + self.aux = aux + super().__init__() + # self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + # self.bn1 = nn.BatchNorm2d(64, affine = affine_par) + + self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64, affine=affine_par) + + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(64, affine=affine_par) + + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(64, affine=affine_par) + + self.relu = nn.ReLU(inplace=True) + + self.dropout = nn.Dropout(0.3) + for i in self.bn1.parameters(): + i.requires_grad = False + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True) # change + + # self.layer1 = self._make_layer(block, 64, layers[0]) + + self.layerx_1 = Res_block_1(64, 64, stride=1, dilation=1) + self.layerx_2 = Res_block_2(256, 64, stride=1, dilation=1) + self.layerx_3 = Res_block_3(256, 64, stride=2, dilation=1) + + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) + # self.layer5 = self._make_pred_layer(Classifier_Module, [6,12,18,24],[6,12,18,24],num_classes) + + self.hffm = _HFFM(2048, [6, 12, 18]) + self.affm_1 = _AFFM() + self.affm_2 = _AFFM() + self.affm_3 = _AFFM() + self.affm_4 = _AFFM() + self.carm = _CARM(256) + + self.con_layer1_1 = 
block_Conv3x3(256) + self.con_res2 = block_Conv3x3(256) + self.con_res3 = block_Conv3x3(512) + self.con_res4 = block_Conv3x3(1024) + self.con_res5 = block_Conv3x3(2048) + + self.dsn1 = nn.Sequential( + nn.Conv2d(256, num_classes, kernel_size=1, stride=1, padding=0) + ) + + self.dsn2 = nn.Sequential( + nn.Conv2d(256, num_classes, kernel_size=1, stride=1, padding=0) + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, 0.01) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + # for i in m.parameters(): + # i.requires_grad = False + + # self.inplanes = 256 # change + + def _make_layer(self, block, planes, blocks, stride=1, dilation=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion or dilation == 2 or dilation == 4: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, affine=affine_par)) + for i in downsample._modules['1'].parameters(): + i.requires_grad = False + layers = [] + layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, dilation=dilation)) + + return nn.Sequential(*layers) + + # def _make_pred_layer(self,block, dilation_series, padding_series,num_classes): + # return block(dilation_series,padding_series,num_classes) + + def base_forward(self, x): + x = self.relu(self.bn1(self.conv1(x))) # 1/2 + x = self.relu(self.bn2(self.conv2(x))) + x = self.relu(self.bn3(self.conv3(x))) + x = self.maxpool(x) # 1/4 + + # x = self.layer1(x) # 1/8 + + # layer1 + x = self.layerx_1(x) # 1/4 + layer1_0 = x + + x = self.layerx_2(x) # 1/4 + layer1_0 = self.con_layer1_1(x + layer1_0) # 256 + size_layer1_0 = layer1_0.size()[2:] + + x = self.layerx_3(x) # 1/8 + res2 = self.con_res2(x) # 256 + size_res2 = res2.size()[2:] + + # layer2-4 + x = self.layer2(x) # 1/16 + res3 = self.con_res3(x) # 256 + x = self.layer3(x) # 1/16 + + res4 = self.con_res4(x) # 256 + x = self.layer4(x) # 1/16 + res5 = self.con_res5(x) # 256 + + # x = self.res5_con1x1(torch.cat([x, res4], dim=1)) + return layer1_0, res2, res3, res4, res5, x, size_layer1_0, size_res2 + + # return res2, res3, res4, res5, x, layer_1024, size_res2 + + def forward(self, x): + # size = x.size()[2:] + layer1_0, res2, res3, res4, res5, layer4, size_layer1_0, size_res2 = self.base_forward(x) + + hffm = self.hffm(layer4, 4) # 256 HFFM + res5 = res5 + hffm + aux_feature = res5 # loss_aux + # res5 = self.carm(res5) + res5, _, _ = self.affm_1(res4, res5, hffm, 2) # 1/16 + # aux_feature = res5 + res5, _, _ = self.affm_2(res3, res5, hffm, 2) # 1/16 + + res5 = F.interpolate(res5, size_res2, mode='bilinear', align_corners=True) + res5, _, _ = self.affm_3(res2, res5, F.interpolate(hffm, size_res2, mode='bilinear', align_corners=True), 2) + + res5 = F.interpolate(res5, size_layer1_0, mode='bilinear', align_corners=True) + res5, _, _ = self.affm_4(layer1_0, res5, + F.interpolate(hffm, size_layer1_0, mode='bilinear', align_corners=True), 2) + + output = self.dsn1(res5) + + if self.aux: + auxout = self.dsn2(aux_feature) + # auxout = F.interpolate(auxout, size, mode='bilinear', align_corners=True) + # outputs.append(auxout) + size = x.size()[2:] + pred, pred_aux = output, auxout + pred = F.interpolate(pred, size, mode='bilinear', 
align_corners=True) + pred_aux = F.interpolate(pred_aux, size, mode='bilinear', align_corners=True) + return pred, pred_aux + + +if __name__ == '__main__': + model = CDnetV2(num_classes=3) + fake_image = torch.rand(2, 3, 256, 256) + output = model(fake_image) + for out in output: + print(out.shape) + # torch.Size([2, 3, 256, 256]) + # torch.Size([2, 3, 256, 256]) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/cloud_adapter.py b/hugging_face/cloud_adapter/cloud_adapter.py new file mode 100644 index 0000000..782b87d --- /dev/null +++ b/hugging_face/cloud_adapter/cloud_adapter.py @@ -0,0 +1,590 @@ +import math +from functools import partial +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import einsum +from einops import rearrange +from mmseg.models.builder import MODELS +from timm.layers import CondConv2d, DropPath, create_act_layer, create_conv2d, get_condconv_initializer, get_norm_act_layer, trunc_normal_ + + +class LoRaMLP(nn.Module): + def __init__(self, in_dim, out_dim, rank_dim=8): + super().__init__() + self.loramlp = nn.Sequential( + nn.Linear(in_dim, rank_dim, bias=False), + nn.Linear(rank_dim, out_dim, bias=False), + ) + + def forward(self, x): + return self.loramlp(x) + + +class CrossAttention(nn.Module): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, rank_dim=None): + super().__init__() + inner_dim = dim_head * heads # 512 + context_dim = query_dim if context_dim is None else context_dim + + self.scale = dim_head ** -0.5 + self.heads = heads + + if not rank_dim: + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Linear(inner_dim, query_dim, bias=False) + else: + self.to_q = LoRaMLP(query_dim, inner_dim, rank_dim=rank_dim) + self.to_k = LoRaMLP(context_dim, inner_dim, rank_dim=rank_dim) + self.to_v = LoRaMLP(context_dim, inner_dim, rank_dim=rank_dim) + + self.to_out = LoRaMLP(inner_dim, query_dim, rank_dim=rank_dim) + + def forward(self, x, context): + h = self.heads + + q = self.to_q(x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange( + t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) + + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + attn = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', attn, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + + return self.to_out(out) + + +def num_groups(group_size, channels): + if not group_size: + return 1 + else: + assert channels % group_size == 0 + return channels // group_size + + +def _init_weight_goog(m, n='', fix_group_fanout=True): + if isinstance(m, CondConv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + init_weight_fn = get_condconv_initializer( + lambda w: nn.init.normal_(w, 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) + init_weight_fn(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + 
nn.init.normal_(m.weight, 0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + fan_out = m.weight.size(0) + fan_in = 0 + if 'routing_fn' in n: + fan_in = m.weight.size(1) + init_range = 1.0 / math.sqrt(fan_in + fan_out) + nn.init.uniform_(m.weight, -init_range, init_range) + if m.bias is not None: + nn.init.zeros_(m.bias) + + +class DepthwiseSeparableConv(nn.Module): + def __init__( + self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='', + noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + se_layer=None, drop_path_rate=0.): + super(DepthwiseSeparableConv, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer) + groups = num_groups(group_size, in_chs) + self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip + self.has_pw_act = pw_act + + self.conv_dw = create_conv2d( + in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, groups=groups) + self.bn1 = norm_act_layer(in_chs, inplace=True) + + self.se = se_layer( + in_chs, act_layer=act_layer) if se_layer else nn.Identity() + + self.conv_pw = create_conv2d( + in_chs, out_chs, pw_kernel_size, padding=pad_type) + self.bn2 = norm_act_layer( + out_chs, inplace=True, apply_act=self.has_pw_act) + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': + return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) + else: + return dict(module='', hook_type='', num_chs=self.conv_pw.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + x = self.bn1(x) + x = self.se(x) + x = self.conv_pw(x) + x = self.bn2(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + +class PMAAConvBlock(nn.Module): + def __init__(self, in_channels=3, hidden_channels=256, depth=4, norm=nn.BatchNorm2d, act=nn.ReLU, return_multi_feats=False, return_last_feature=True, has_stem=True, has_block=True): + super().__init__() + self.return_last_feature = return_last_feature + self.depth = depth + self.has_stem = has_stem + self.return_multi_feats = return_multi_feats + + self.proj_1x1 = DepthwiseSeparableConv( + in_channels, hidden_channels, dw_kernel_size=1, norm_layer=norm, act_layer=act) + + self.spp_dw = nn.ModuleList() + + if has_stem: + self.spp_dw.append( + DepthwiseSeparableConv(hidden_channels, hidden_channels, dw_kernel_size=3, + stride=1, group_size=hidden_channels, pad_type="same") + ) + else: + self.spp_dw.append(nn.Identity()) + + if has_block: + for _ in range(self.depth): + self.spp_dw.append( + DepthwiseSeparableConv( + hidden_channels, hidden_channels, dw_kernel_size=3, stride=2, group_size=hidden_channels + ) + ) + else: + for _ in range(self.depth): + self.spp_dw.append( + nn.MaxPool2d(kernel_size=2, stride=2) + ) + self._init_weights() + + def forward(self, x): + B, C, H, W = x.shape + output1 = self.proj_1x1(x) + output = [self.spp_dw[0](output1)] + + for k in range(1, self.depth+1): + out_k = self.spp_dw[k](output[-1]) + output.append(out_k) + + if self.return_multi_feats: + return output[1:] + else: + if self.return_last_feature: + return output[-1] + global_f = torch.zeros( + output[-1].shape, requires_grad=True, device=output1.device) + for fea in output: + global_f = global_f + F.adaptive_avg_pool2d( + fea, 
output_size=output[-1].shape[-2:] + ) + return global_f + + def _init_weights(self): + init_fn = _init_weight_goog + for n, m in self.named_modules(): + init_fn(m, n) + + +class ConvnextInteractiveModule(nn.Module): + def __init__(self, emd_dim=1024, context_dim=256, rank_dim=None): + super().__init__() + self.attn = CrossAttention(emd_dim, context_dim, rank_dim=rank_dim) + + def forward(self, x, cache, index): + # x: 1024 2 1024 + if isinstance(cache, list) or isinstance(cache, tuple): + # len(cache) 4 cache[4]-23 + # 0-5->0 6-11 -> 1 12-17->2 18-23->3 + cache = cache[index] + cache = F.interpolate( + cache, (int(math.sqrt(x.shape[0])), int(math.sqrt(x.shape[0]))), mode="bilinear", align_corners=False + ) + cache = cache.flatten(2) # B C N + cache = cache.permute(2, 0, 1) # N B C + + # Reshape: batch first + x = x.permute(1, 0, 2) # B N C + cache = cache.permute(1, 0, 2) # B N C + return (x + self.attn(x, cache)).permute(1, 0, 2) + + +class PMAAInteractiveModule(nn.Module): + def __init__(self, + emd_dim=1024, + context_dim=64, + kernel: int = 1, + norm=nn.BatchNorm2d, + local_groups=32, + global_groups=2, + return_multi_feats=False, + ): + super().__init__() + self.return_multi_feats = return_multi_feats + self.local_embedding = nn.Sequential( + nn.Conv2d(emd_dim, emd_dim, kernel, groups=local_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.global_embedding = nn.Sequential( + nn.Conv2d(context_dim, emd_dim, kernel, groups=global_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.global_act = nn.Sequential( + nn.Conv2d(context_dim, emd_dim, kernel, groups=global_groups, + padding=int((kernel - 1) / 2), bias=False), + norm(emd_dim) + ) + self.act = nn.Sigmoid() + self._init_weights() + + def _init_weights(self): + init_fn = _init_weight_goog + for n, m in self.named_modules(): + init_fn(m, n) + + def forward(self, x, cache, index): + if isinstance(cache, list) or isinstance(cache, tuple): + cache = cache[index] + N, B, C = x.shape + H = W = int(math.sqrt(N)) + # reshape x -> B, C, H, W + x = x.permute(1, 2, 0).reshape(B, C, H, W) + local_feat = self.local_embedding(x) # 32 + global_act = self.global_act(cache) + sig_act = F.interpolate(self.act(global_act), size=(H, W)) # 32 + + global_feat = self.global_embedding(cache) + global_feat = F.interpolate(global_feat, size=(H, W)) # 32 + + out = local_feat * sig_act + global_feat + + return out.permute(2, 3, 0, 1).reshape(N, B, C) + + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). 
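+ Example (editor's sketch, not in the original file): ln = LayerNorm(64, data_format="channels_first") accepts an (N, 64, H, W) tensor and normalizes each spatial position over its 64 channels, e.g. y = ln(torch.randn(2, 64, 32, 32)) returns a tensor of the same shape.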
+ """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class Block(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, + padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + # pointwise/1x1 convs, implemented with linear layers + self.pwconv1 = nn.Linear(dim, 4 * dim) + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
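+ Note (editor): this adapter variant drops the classification head, so the num_classes and head_init_scale entries above are leftovers from the upstream docstring and are not accepted by __init__ here; CloudAdapter instantiates this backbone as ConvNeXt(depths=[1]*4, dims=[context_dim]*4, return_multi_feats=..., return_last_feature=...) (see CloudAdapter.__init__ below).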
+ """ + + def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], + drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=[0, 1, 2, 3], + return_multi_feats=False, + return_last_feature=True + ): + super().__init__() + self.return_last_feature = return_last_feature + self.return_multi_feats = return_multi_feats + + # stem and 3 intermediate downsampling conv layers + self.downsample_layers = nn.ModuleList() + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=2, stride=2), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first") + ) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + # 4 feature resolution stages, each consisting of multiple residual blocks + self.stages = nn.ModuleList() + dp_rates = [x.item() + for x in torch.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential( + *[Block(dim=dims[i], drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] + ) + self.stages.append(stage) + cur += depths[i] + + self.out_indices = out_indices + + norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") + for i_layer in range(4): + layer = norm_layer(dims[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
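+ Note (editor): the load_checkpoint call below is commented out, so passing a checkpoint path currently only re-applies the random initialization; no pretrained weights are actually loaded.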
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + if isinstance(pretrained, str): + self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') + + def forward_features(self, x): + outs = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x) + outs.append(x_out) + if self.return_multi_feats: + return tuple(outs) + if self.return_last_feature: + return outs[-1] + global_f = torch.zeros( + outs[-1].shape, requires_grad=True, device=outs[-1].device) + for fea in outs: + global_f = global_f + F.adaptive_avg_pool2d( + fea, output_size=outs[-1].shape[-2:] + ) + return global_f + + def forward(self, x): + x = self.forward_features(x) + return x + + +class NoAdaptingModule(nn.Identity): + def __init__(self): + super().__init__() + + def forward(self, x, cache, layer): + return x + + +@MODELS.register_module() +class CloudAdapter(nn.Module): + def __init__(self, + cnn_type="convnext", # convnext or mobilenet + int_type="convnext", # cross_attention or + # 共同的参数 start + emd_dim=1024, + num_layers=24, + + # 先判断是否返回多特征,之后再判断是否进行特征融合 + return_multi_feats=True, + return_last_feature=False, + + # 共同的参数 end + + # pmaa 提取单个特征 or 多尺寸特征 start + hidden_channels=256, + depth=4, + norm=nn.BatchNorm2d, + act=nn.ReLU, + # pmaa 提取单个特征 or 多尺寸特征 end + + # pmaa net start + local_groups=1, + global_groups=1, + # pmaa net end + + # convnext 提取单个特征 or 多尺寸特征 start + context_dim=256, + rank_dim=None, + # convnext 提取单个特征 or 多尺寸特征 end, + has_stem=True, + has_block=True, + ): + super().__init__() + self.cnn = nn.Identity() + self.net = nn.Identity() + if cnn_type == "pmaa": + self.cnn = PMAAConvBlock( + hidden_channels=hidden_channels, + depth=depth, + norm=norm, + act=act, + return_multi_feats=return_multi_feats, + return_last_feature=return_last_feature, + has_stem=has_stem, + has_block=has_block + ) + elif cnn_type == "convnext": + self.cnn = ConvNeXt(depths=[1]*4, + dims=[context_dim]*4, + return_multi_feats=return_multi_feats, + return_last_feature=return_last_feature + ) + + else: + raise ValueError( + f"cnn_type must in ['convnext','pmaa'],but got {cnn_type}") + + if int_type == "convnext": + self.net = nn.ModuleList( + ConvnextInteractiveModule(emd_dim, context_dim, rank_dim) + for _ in range(num_layers) + ) + elif int_type == "pmaa": + self.net = nn.ModuleList( + PMAAInteractiveModule( + emd_dim, context_dim, local_groups=local_groups, global_groups=global_groups) + for _ in range(num_layers) + ) + + elif int_type == "no_adapting": + self.net = nn.ModuleList( + NoAdaptingModule() for _ in range(num_layers) + ) + else: + raise ValueError( + f"int_type must in ['convnext','pmaa'],but got {int_type}") + + def forward(self, feats, layer, batch_first=True, has_cls_token=True, cache=None): + if batch_first: + feats = feats.permute(1, 0, 2) # 1025 2 1024 + if has_cls_token: + cls_token, feats = torch.tensor_split(feats, [1], dim=0) + # 24 // 1 + # feat: 1024 2 1024 + feats = self.net[layer].forward( + feats, cache, layer//(len(self.net) // 4)) + + if has_cls_token: + feats = 
torch.cat([cls_token, feats], dim=0) + if batch_first: + feats = feats.permute(1, 0, 2) + return feats + diff --git a/hugging_face/cloud_adapter/cloud_adapter_dinov2.py b/hugging_face/cloud_adapter/cloud_adapter_dinov2.py new file mode 100644 index 0000000..0c8fadb --- /dev/null +++ b/hugging_face/cloud_adapter/cloud_adapter_dinov2.py @@ -0,0 +1,115 @@ +from mmseg.models.builder import BACKBONES, MODELS +from torch import nn as nn +from .cloud_adapter import CloudAdapter +from .dino_v2 import DinoVisionTransformer +from .utils import set_requires_grad, set_train +import torch +import torch.nn.functional as F + + +@BACKBONES.register_module() +class CloudAdapterDinoVisionTransformer(DinoVisionTransformer): + def __init__( + self, + cloud_adapter_config=None, + has_cat=False, + # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, ], + adapter_index=[0, 6, 12, 18], # Transformer Block 的索引 + **kwargs, + ): + super().__init__(**kwargs) + self.cloud_adapter: CloudAdapter = MODELS.build(cloud_adapter_config) + self.has_cat = has_cat + self.adapter_index = adapter_index + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + cache = self.cloud_adapter.cnn(x) # 得到多尺度特征或者单个特征 + H, W = h // self.patch_size, w // self.patch_size + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + cur_idx = 0 # 交互模块的索引 + for idx, blk in enumerate(self.blocks): + x = blk(x) + if idx in self.adapter_index: + x = self.cloud_adapter.forward( + x, + cur_idx, + batch_first=True, + has_cls_token=True, + cache=cache, + ) + cur_idx += 1 + if idx in self.out_indices: + outs.append( + x[:, 1:, :].permute(0, 2, 1).reshape( + B, -1, H, W).contiguous() + ) + return outs, cache + + def process_cache(self,ret,cache): + cache = F.interpolate( + cache,size=(ret.shape[-2],ret.shape[-1]),mode="bilinear",align_corners=False) + return cache + + def forward(self, *args, **kwargs): + ret, cache = self.forward_features(*args, **kwargs) + if isinstance(ret[0], torch.Tensor): + ret[0] = F.interpolate( + ret[0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[1] = F.interpolate( + ret[1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[3] = F.interpolate( + ret[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + if isinstance(cache,tuple) or isinstance(cache,list): + ret[0] = torch.cat((ret[0], cache[0]), dim=1) + ret[1] = torch.cat((ret[1], cache[1]), dim=1) + ret[2] = torch.cat((ret[2], cache[2]), dim=1) + ret[3] = torch.cat((ret[3], cache[3]), dim=1) + else: + ret[0] = torch.cat((ret[0], self.process_cache(ret[0],cache)), dim=1) + ret[1] = torch.cat((ret[1], self.process_cache(ret[1],cache)), dim=1) + ret[2] = torch.cat((ret[2], self.process_cache(ret[2],cache)), dim=1) + ret[3] = torch.cat((ret[3], self.process_cache(ret[3],cache)), dim=1) + # ret[0] = torch.cat(ret[0], cache[0], dim=1) # bs 1024 128 128, bs 256 128 128 + else: + ret[0][0] = F.interpolate( + ret[0][0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[0][1] = F.interpolate( + ret[0][1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[0][3] = F.interpolate( + ret[0][3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + if self.has_cat: + if isinstance(cache,tuple) or isinstance(cache,list): + ret[0][0] = torch.cat((ret[0][0], cache[0]), dim=1) + ret[0][1] = torch.cat((ret[0][1], cache[1]), dim=1) + ret[0][2] = torch.cat((ret[0][2], cache[2]), dim=1) + ret[0][3] = torch.cat((ret[0][3], cache[3]), dim=1) + else: + ret[0][0] = torch.cat((ret[0][0], 
self.process_cache(ret[0][0],cache)), dim=1) + ret[0][1] = torch.cat((ret[0][1], self.process_cache(ret[0][1],cache)), dim=1) + ret[0][2] = torch.cat((ret[0][2], self.process_cache(ret[0][2],cache)), dim=1) + ret[0][3] = torch.cat((ret[0][3], self.process_cache(ret[0][3],cache)), dim=1) + return ret + + def train(self, mode: bool = True): + if not mode: + return super().train(mode) + set_requires_grad(self, ["cloud_adapter"]) + set_train(self, ["cloud_adapter"]) + + def state_dict(self, destination, prefix, keep_vars): + state = super().state_dict(destination, prefix, keep_vars) + keys = [k for k in state.keys() if "cloud_adapter" not in k] + for key in keys: + state.pop(key) + if key in destination: + destination.pop(key) + return state diff --git a/hugging_face/cloud_adapter/dbnet.py b/hugging_face/cloud_adapter/dbnet.py new file mode 100644 index 0000000..b1dfa46 --- /dev/null +++ b/hugging_face/cloud_adapter/dbnet.py @@ -0,0 +1,680 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/7/26 上午11:19 +# @Author : xiaoshun +# @Email : 3038523973@qq.com +# @File : dbnet.py +# @Software: PyCharm + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +# from models.Transformer.ViT import truncated_normal_ + +# Decoder细化卷积模块 +class SBR(nn.Module): + def __init__(self, in_ch): + super(SBR, self).__init__() + self.conv1x3 = nn.Sequential( + nn.Conv2d(in_ch, in_ch, kernel_size=(1, 3), stride=1, padding=(0, 1)), + nn.BatchNorm2d(in_ch), + nn.ReLU(True) + ) + self.conv3x1 = nn.Sequential( + nn.Conv2d(in_ch, in_ch, kernel_size=(3, 1), stride=1, padding=(1, 0)), + nn.BatchNorm2d(in_ch), + nn.ReLU(True) + ) + + def forward(self, x): + out = self.conv3x1(self.conv1x3(x)) # 先进行1x3的卷积,得到结果并将结果再进行3x1的卷积 + return out + x + + +# 下采样卷积模块 stage 1,2,3 +class c_stage123(nn.Module): + def __init__(self, in_chans, out_chans): + super().__init__() + self.stage123 = nn.Sequential( + nn.Conv2d(in_channels=in_chans, out_channels=out_chans, kernel_size=3, stride=2, padding=1), + nn.BatchNorm2d(out_chans), + nn.ReLU(), + nn.Conv2d(in_channels=out_chans, out_channels=out_chans, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(out_chans), + nn.ReLU(), + ) + self.conv1x1_123 = nn.Conv2d(in_channels=in_chans, out_channels=out_chans, kernel_size=1) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + stage123 = self.stage123(x) # 3*3卷积,两倍下采样 3*224*224-->64*112*112 + max = self.maxpool(x) # 最大值池化,两倍下采样 3*224*224-->3*112*112 + max = self.conv1x1_123(max) # 1*1卷积 3*112*112-->64*112*112 + stage123 = stage123 + max # 残差结构,广播机制 + return stage123 + + +# 下采样卷积模块 stage4,5 +class c_stage45(nn.Module): + def __init__(self, in_chans, out_chans): + super().__init__() + self.stage45 = nn.Sequential( + nn.Conv2d(in_channels=in_chans, out_channels=out_chans, kernel_size=3, stride=2, padding=1), + nn.BatchNorm2d(out_chans), + nn.ReLU(), + nn.Conv2d(in_channels=out_chans, out_channels=out_chans, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(out_chans), + nn.ReLU(), + nn.Conv2d(in_channels=out_chans, out_channels=out_chans, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(out_chans), + nn.ReLU(), + ) + self.conv1x1_45 = nn.Conv2d(in_channels=in_chans, out_channels=out_chans, kernel_size=1) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + stage45 = self.stage45(x) # 3*3卷积模块 2倍下采样 + max = self.maxpool(x) # 最大值池化,两倍下采样 + max = self.conv1x1_45(max) # 1*1卷积模块 调整通道数 + stage45 = stage45 + max # 残差结构 + 
return stage45 + + +class Identity(nn.Module): # 恒等映射 + def __init__(self): + super().__init__() + + def forward(self, x): + return x + + +# 轻量卷积模块 +class DepthwiseConv2d(nn.Module): # 用于自注意力机制 + def __init__(self, in_chans, out_chans, kernel_size=1, stride=1, padding=0, dilation=1): + super().__init__() + # depthwise conv + self.depthwise = nn.Conv2d( + in_channels=in_chans, + out_channels=in_chans, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, # 深层卷积的膨胀率 + groups=in_chans # 指定分组卷积的组数 + ) + # batch norm + self.bn = nn.BatchNorm2d(num_features=in_chans) + + # pointwise conv 逐点卷积 + self.pointwise = nn.Conv2d( + in_channels=in_chans, + out_channels=out_chans, + kernel_size=1 + ) + + def forward(self, x): + x = self.depthwise(x) + x = self.bn(x) + x = self.pointwise(x) + return x + + +# residual skip connection 残差跳跃连接 +class Residual(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, input, **kwargs): + x = self.fn(input, **kwargs) + return (x + input) + + +# layer norm plus 层归一化 +class PreNorm(nn.Module): # 代表神经网络层 + def __init__(self, dim, fn): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.fn = fn + + def forward(self, input, **kwargs): + return self.fn(self.norm(input), **kwargs) + + +# FeedForward层使得representation的表达能力更强 +class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim, dropout=0.): + super().__init__() + self.net = nn.Sequential( + nn.Linear(in_features=dim, out_features=hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(in_features=hidden_dim, out_features=dim), + nn.Dropout(dropout) + ) + + def forward(self, input): + return self.net(input) + + +class ConvAttnetion(nn.Module): + ''' + using the Depth_Separable_Wise Conv2d to produce the q, k, v instead of using Linear Project in ViT + ''' + + def __init__(self, dim, img_size, heads=8, dim_head=64, kernel_size=3, q_stride=1, k_stride=1, v_stride=1, + dropout=0., last_stage=False): + super().__init__() + self.last_stage = last_stage + self.img_size = img_size + inner_dim = dim_head * heads # 512 + project_out = not (heads == 1 and dim_head == dim) + + self.heads = heads + self.scale = dim_head ** (-0.5) + + pad = (kernel_size - q_stride) // 2 + + self.to_q = DepthwiseConv2d(in_chans=dim, out_chans=inner_dim, kernel_size=kernel_size, stride=q_stride, + padding=pad) # 自注意力机制 + self.to_k = DepthwiseConv2d(in_chans=dim, out_chans=inner_dim, kernel_size=kernel_size, stride=k_stride, + padding=pad) + self.to_v = DepthwiseConv2d(in_chans=dim, out_chans=inner_dim, kernel_size=kernel_size, stride=v_stride, + padding=pad) + + self.to_out = nn.Sequential( + nn.Linear( + in_features=inner_dim, + out_features=dim + ), + nn.Dropout(dropout) + ) if project_out else Identity() + + def forward(self, x): + b, n, c, h = *x.shape, self.heads # * 星号的作用大概是去掉 tuple 属性吧 + + # print(x.shape) + # print('+++++++++++++++++++++++++++++++++') + + # if语句内容没有使用 + if self.last_stage: + cls_token = x[:, 0] + # print(cls_token.shape) + # print('+++++++++++++++++++++++++++++++++') + x = x[:, 1:] # 去掉每个数组的第一个元素 + + cls_token = rearrange(torch.unsqueeze(cls_token, dim=1), 'b n (h d) -> b h n d', h=h) + + # rearrange:用于对张量的维度进行重新变换排序,可用于替换pytorch中的reshape,view,transpose和permute等操作 + x = rearrange(x, 'b (l w) n -> b n l w', l=self.img_size, w=self.img_size) # [1, 3136, 64]-->1*64*56*56 + # batch_size,N(通道数),h,w + + q = self.to_q(x) # 1*64*56*56-->1*64*56*56 + # print(q.shape) + # print('++++++++++++++') + q = rearrange(q, 'b (h d) l w -> b h (l w) d', 
h=h) # 1*64*56*56-->1*1*3136*64 + # print(q.shape) + # print('=====================') + # batch_size,head,h*w,dim_head + + k = self.to_k(x) # 操作和q一样 + k = rearrange(k, 'b (h d) l w -> b h (l w) d', h=h) + # batch_size,head,h*w,dim_head + + v = self.to_v(x) ##操作和q一样 + # print(v.shape) + # print('[[[[[[[[[[[[[[[[[[[[[[[[[[[[') + v = rearrange(v, 'b (h d) l w -> b h (l w) d', h=h) + # print(v.shape) + # print(']]]]]]]]]]]]]]]]]]]]]]]]]]]') + # batch_size,head,h*w,dim_head + + if self.last_stage: + # print(q.shape) + # print('================') + q = torch.cat([cls_token, q], dim=2) + # print(q.shape) + # print('++++++++++++++++++') + v = torch.cat([cls_token, v], dim=2) + k = torch.cat([cls_token, k], dim=2) + + # calculate attention by matmul + scale + # permute:(batch_size,head,dim_head,h*w + # print(k.shape) + # print('++++++++++++++++++++') + k = k.permute(0, 1, 3, 2) # 1*1*3136*64-->1*1*64*3136 + # print(k.shape) + # print('====================') + attention = (q.matmul(k)) # 1*1*3136*3136 + # print(attention.shape) + # print('--------------------') + attention = attention * self.scale # 可以得到一个logit的向量,避免出现梯度下降和梯度爆炸 + # print(attention.shape) + # print('####################') + # pass a softmax + attention = F.softmax(attention, dim=-1) + # print(attention.shape) + # print('********************') + + # matmul v + # attention.matmul(v):(batch_size,head,h*w,dim_head) + # permute:(batch_size,h*w,head,dim_head) + out = (attention.matmul(v)).permute(0, 2, 1, 3).reshape(b, n, + c) # 1*3136*64 这些操作的目的是将注意力权重和值向量相乘后得到的结果进行重塑,得到一个形状为 (batch size, 序列长度, 值向量或矩阵的维度) 的张量 + + # linear project + out = self.to_out(out) + return out + + +# Reshape Layers +class Rearrange(nn.Module): + def __init__(self, string, h, w): + super().__init__() + self.string = string + self.h = h + self.w = w + + def forward(self, input): + + if self.string == 'b c h w -> b (h w) c': + N, C, H, W = input.shape + # print(input.shape) + x = torch.reshape(input, shape=(N, -1, self.h * self.w)).permute(0, 2, 1) + # print(x.shape) + # print('+++++++++++++++++++') + if self.string == 'b (h w) c -> b c h w': + N, _, C = input.shape + # print(input.shape) + x = torch.reshape(input, shape=(N, self.h, self.w, -1)).permute(0, 3, 1, 2) + # print(x.shape) + # print('=====================') + return x + + +# Transformer layers +class Transformer(nn.Module): + def __init__(self, dim, img_size, depth, heads, dim_head, mlp_dim, dropout=0., last_stage=False): + super().__init__() + self.layers = nn.ModuleList([ # 管理子模块,参数注册 + nn.ModuleList([ + PreNorm(dim=dim, fn=ConvAttnetion(dim, img_size, heads=heads, dim_head=dim_head, dropout=dropout, + last_stage=last_stage)), # 归一化,重参数化 + PreNorm(dim=dim, fn=FeedForward(dim=dim, hidden_dim=mlp_dim, dropout=dropout)) + ]) for _ in range(depth) + ]) + + def forward(self, x): + for attn, ff in self.layers: + x = x + attn(x) + x = x + ff(x) + return x + + +class DBNet(nn.Module): # 最主要的大函数 + def __init__(self, img_size, in_channels, num_classes, dim=64, kernels=[7, 3, 3, 3], strides=[4, 2, 2, 2], + heads=[1, 3, 6, 6], + depth=[1, 2, 10, 10], pool='cls', dropout=0., emb_dropout=0., scale_dim=4, ): + super().__init__() + + assert pool in ['cls', 'mean'], f'pool type must be either cls or mean pooling' + self.pool = pool + self.dim = dim + + # stage1 + # k:7 s:4 in: 1, 64, 56, 56 out: 1, 3136, 64 + self.stage1_conv_embed = nn.Sequential( + nn.Conv2d( # 1*3*224*224-->[1, 64, 56, 56] + in_channels=in_channels, + out_channels=dim, + kernel_size=kernels[0], + stride=strides[0], + padding=2 + ), + Rearrange('b c h w 
-> b (h w) c', h=img_size // 4, w=img_size // 4), # [1, 64, 56, 56]-->[1, 3136, 64] + nn.LayerNorm(dim) # 对每个batch归一化 + ) + + self.stage1_transformer = nn.Sequential( + Transformer( # + dim=dim, + img_size=img_size // 4, + depth=depth[0], # Transformer层中的编码器和解码器层数。 + heads=heads[0], + dim_head=self.dim, # 它是每个注意力头的维度大小,通常是嵌入维度除以头数。 + mlp_dim=dim * scale_dim, # mlp_dim:它是Transformer中前馈神经网络的隐藏层维度大小,通常是嵌入维度乘以一个缩放因子。 + dropout=dropout, + # last_stage=last_stage #它是一个标志位,用于表示该Transformer层是否是最后一层。 + ), + Rearrange('b (h w) c -> b c h w', h=img_size // 4, w=img_size // 4) + ) + + # stage2 + # k:3 s:2 in: 1, 192, 28, 28 out: 1, 784, 192 + in_channels = dim + scale = heads[1] // heads[0] + dim = scale * dim + + self.stage2_conv_embed = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=dim, + kernel_size=kernels[1], + stride=strides[1], + padding=1 + ), + Rearrange('b c h w -> b (h w) c', h=img_size // 8, w=img_size // 8), + nn.LayerNorm(dim) + ) + + self.stage2_transformer = nn.Sequential( + Transformer( + dim=dim, + img_size=img_size // 8, + depth=depth[1], + heads=heads[1], + dim_head=self.dim, + mlp_dim=dim * scale_dim, + dropout=dropout + ), + Rearrange('b (h w) c -> b c h w', h=img_size // 8, w=img_size // 8) + ) + + # stage3 + in_channels = dim + scale = heads[2] // heads[1] + dim = scale * dim + + self.stage3_conv_embed = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=dim, + kernel_size=kernels[2], + stride=strides[2], + padding=1 + ), + Rearrange('b c h w -> b (h w) c', h=img_size // 16, w=img_size // 16), + nn.LayerNorm(dim) + ) + + self.stage3_transformer = nn.Sequential( + Transformer( + dim=dim, + img_size=img_size // 16, + depth=depth[2], + heads=heads[2], + dim_head=self.dim, + mlp_dim=dim * scale_dim, + dropout=dropout + ), + Rearrange('b (h w) c -> b c h w', h=img_size // 16, w=img_size // 16) + ) + + # stage4 + in_channels = dim + scale = heads[3] // heads[2] + dim = scale * dim + + self.stage4_conv_embed = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=dim, + kernel_size=kernels[3], + stride=strides[3], + padding=1 + ), + Rearrange('b c h w -> b (h w) c', h=img_size // 32, w=img_size // 32), + nn.LayerNorm(dim) + ) + + self.stage4_transformer = nn.Sequential( + Transformer( + dim=dim, img_size=img_size // 32, + depth=depth[3], + heads=heads[3], + dim_head=self.dim, + mlp_dim=dim * scale_dim, + dropout=dropout, + ), + Rearrange('b (h w) c -> b c h w', h=img_size // 32, w=img_size // 32) + ) + + ### CNN Branch ### + self.c_stage1 = c_stage123(in_chans=3, out_chans=64) + self.c_stage2 = c_stage123(in_chans=64, out_chans=128) + self.c_stage3 = c_stage123(in_chans=128, out_chans=384) + self.c_stage4 = c_stage45(in_chans=384, out_chans=512) + self.c_stage5 = c_stage45(in_chans=512, out_chans=1024) + self.c_max = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.up_conv1 = nn.Conv2d(in_channels=192, out_channels=128, kernel_size=1) + self.up_conv2 = nn.Conv2d(in_channels=384, out_channels=512, kernel_size=1) + + ### CTmerge ### + self.CTmerge1 = nn.Sequential( + nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(64), + nn.ReLU(), + nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(64), + nn.ReLU(), + ) + self.CTmerge2 = nn.Sequential( + nn.Conv2d(in_channels=320, out_channels=128, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(128), + nn.ReLU(), + nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, 
padding=1), + nn.BatchNorm2d(128), + nn.ReLU(), + ) + self.CTmerge3 = nn.Sequential( + nn.Conv2d(in_channels=768, out_channels=512, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(512), + nn.ReLU(), + nn.Conv2d(in_channels=512, out_channels=384, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(384), + nn.ReLU(), + nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(384), + nn.ReLU(), + ) + + self.CTmerge4 = nn.Sequential( + nn.Conv2d(in_channels=896, out_channels=640, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(640), + nn.ReLU(), + nn.Conv2d(in_channels=640, out_channels=512, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(512), + nn.ReLU(), + nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(512), + nn.ReLU(), + ) + + # decoder + self.decoder4 = nn.Sequential( + DepthwiseConv2d( + in_chans=1408, + out_chans=1024, + kernel_size=3, + stride=1, + padding=1 + ), + DepthwiseConv2d( + in_chans=1024, + out_chans=512, + kernel_size=3, + stride=1, + padding=1 + ), + nn.GELU() + ) + self.decoder3 = nn.Sequential( + DepthwiseConv2d( + in_chans=896, + out_chans=512, + kernel_size=3, + stride=1, + padding=1 + ), + DepthwiseConv2d( + in_chans=512, + out_chans=384, + kernel_size=3, + stride=1, + padding=1 + ), + nn.GELU() + ) + + self.decoder2 = nn.Sequential( + DepthwiseConv2d( + in_chans=576, + out_chans=256, + kernel_size=3, + stride=1, + padding=1 + ), + DepthwiseConv2d( + in_chans=256, + out_chans=192, + kernel_size=3, + stride=1, + padding=1 + ), + nn.GELU() + ) + + self.decoder1 = nn.Sequential( + DepthwiseConv2d( + in_chans=256, + out_chans=64, + kernel_size=3, + stride=1, + padding=1 + ), + DepthwiseConv2d( + in_chans=64, + out_chans=16, + kernel_size=3, + stride=1, + padding=1 + ), + nn.GELU() + ) + self.sbr4 = SBR(512) + self.sbr3 = SBR(384) + self.sbr2 = SBR(192) + self.sbr1 = SBR(16) + + self.head = nn.Conv2d(in_channels=16, out_channels=num_classes, kernel_size=1) + + def forward(self, input): + ### encoder ### + # stage1 = ts1 cat cs1 + # t_s1 = self.t_stage1(input) + # print(input.shape) + # print('++++++++++++++++++++++') + + t_s1 = self.stage1_conv_embed(input) # 1*3*224*224-->1*3136*64 + + # print(t_s1.shape) + # print('======================') + + t_s1 = self.stage1_transformer(t_s1) # 1*3136*64-->1*64*56*56 + + # print(t_s1.shape) + # print('----------------------') + + c_s1 = self.c_stage1(input) # 1*3*224*224-->1*64*112*112 + + # print(c_s1.shape) + # print('!!!!!!!!!!!!!!!!!!!!!!!') + + stage1 = self.CTmerge1(torch.cat([t_s1, self.c_max(c_s1)], dim=1)) # 1*64*56*56 # 拼接两条分支 + + # print(stage1.shape) + # print('[[[[[[[[[[[[[[[[[[[[[[[') + + # stage2 = ts2 up cs2 + # t_s2 = self.t_stage2(stage1) + t_s2 = self.stage2_conv_embed(stage1) # 1*64*56*56-->1*784*192 # stage2_conv_embed是转化为序列操作 + + # print(t_s2.shape) + # print('[[[[[[[[[[[[[[[[[[[[[[[') + t_s2 = self.stage2_transformer(t_s2) # 1*784*192-->1*192*28*28 + # print(t_s2.shape) + # print('+++++++++++++++++++++++++') + + c_s2 = self.c_stage2(c_s1) # 1*64*112*112-->1*128*56*56 + stage2 = self.CTmerge2( + torch.cat([c_s2, F.interpolate(t_s2, size=c_s2.size()[2:], mode='bilinear', align_corners=True)], + dim=1)) # mode='bilinear'表示使用双线性插值 1*128*56*56 + + # stage3 = ts3 cat cs3 + # t_s3 = self.t_stage3(t_s2) + t_s3 = self.stage3_conv_embed(t_s2) # 1*192*28*28-->1*196*384 + # print(t_s3.shape) + # print('///////////////////////') + t_s3 = self.stage3_transformer(t_s3) # 1*196*384-->1*384*14*14 + # 
print(t_s3.shape) + # print('....................') + c_s3 = self.c_stage3(stage2) # 1*128*56*56-->1*384*28*28 + stage3 = self.CTmerge3(torch.cat([t_s3, self.c_max(c_s3)], dim=1)) # 1*384*14*14 + + # stage4 = ts4 up cs4 + # t_s4 = self.t_stage4(stage3) + t_s4 = self.stage4_conv_embed(stage3) # 1*384*14*14-->1*49*384 + # print(t_s4.shape) + # print(';;;;;;;;;;;;;;;;;;;;;;;') + t_s4 = self.stage4_transformer(t_s4) # 1*49*384-->1*384*7*7 + # print(t_s4.shape) + # print('::::::::::::::::::::') + + c_s4 = self.c_stage4(c_s3) # 1*384*28*28-->1*512*14*14 + stage4 = self.CTmerge4( + torch.cat([c_s4, F.interpolate(t_s4, size=c_s4.size()[2:], mode='bilinear', align_corners=True)], + dim=1)) # 1*512*14*14 + + # cs5 + c_s5 = self.c_stage5(stage4) # 1*512*14*14-->1*1024*7*7 + + ### decoder ### + decoder4 = torch.cat([c_s5, t_s4], dim=1) # 1*1408*7*7 + decoder4 = self.decoder4(decoder4) # 1*1408*7*7-->1*512*7*7 + decoder4 = F.interpolate(decoder4, size=c_s3.size()[2:], mode='bilinear', + align_corners=True) # 1*512*7*7-->1*512*28*28 + decoder4 = self.sbr4(decoder4) # 1*512*28*28 + # print(decoder4.shape) + + decoder3 = torch.cat([decoder4, c_s3], dim=1) # 1*896*28*28 + decoder3 = self.decoder3(decoder3) # 1*384*28*28 + decoder3 = F.interpolate(decoder3, size=t_s2.size()[2:], mode='bilinear', align_corners=True) # 1*384*28*28 + decoder3 = self.sbr3(decoder3) # 1*384*28*28 + # print(decoder3.shape) + + decoder2 = torch.cat([decoder3, t_s2], dim=1) # 1*576*28*28 + decoder2 = self.decoder2(decoder2) # 1*192*28*28 + decoder2 = F.interpolate(decoder2, size=c_s1.size()[2:], mode='bilinear', align_corners=True) # 1*192*112*112 + decoder2 = self.sbr2(decoder2) # 1*192*112*112 + # print(decoder2.shape) + + decoder1 = torch.cat([decoder2, c_s1], dim=1) # 1*256*112*112 + decoder1 = self.decoder1(decoder1) # 1*16*112*112 + # print(decoder1.shape) + final = F.interpolate(decoder1, size=input.size()[2:], mode='bilinear', align_corners=True) # 1*16*224*224 + # print(final.shape) + # final = self.sbr1(decoder1) + # print(final.shape) + final = self.head(final) # 1*3*224*224 + + return final + + +if __name__ == '__main__': + x = torch.rand(1, 3, 224, 224).cuda() + model = DBNet(img_size=224, in_channels=3, num_classes=7).cuda() + y = model(x) + print(y.shape) + # torch.Size([1, 7, 224, 224]) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/dino_layers/__init__.py b/hugging_face/cloud_adapter/dino_layers/__init__.py new file mode 100644 index 0000000..0498f46 --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock,drop_add_residual_stochastic_depth +from .attention import MemEffAttention \ No newline at end of file diff --git a/hugging_face/cloud_adapter/dino_layers/attention.py b/hugging_face/cloud_adapter/dino_layers/attention.py new file mode 100644 index 0000000..0fb76ef --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/attention.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging +import os +import warnings + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import memory_efficient_attention, unbind + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (Attention)") + else: + warnings.warn("xFormers is disabled (Attention)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + warnings.warn("xFormers is not available (Attention)") + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + if attn_bias is not None: + raise AssertionError("xFormers is required for using nested tensors") + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/hugging_face/cloud_adapter/dino_layers/block.py b/hugging_face/cloud_adapter/dino_layers/block.py new file mode 100644 index 0000000..930787b --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/block.py @@ -0,0 +1,260 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +import os +from typing import Callable, List, Any, Tuple, Dict +import warnings + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import fmha, scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (Block)") + else: + warnings.warn("xFormers is disabled (Block)") + raise ImportError +except ImportError: + XFORMERS_AVAILABLE = False + + warnings.warn("xFormers is not available (Block)") + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to 
get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) 
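+ # Editor's note: the two calls below apply stochastic depth over the whole list of token tensors. Each call draws a random subset of samples (get_branges_scales), runs the attention/FFN residual branch only on that subset, and index-adds the result back scaled by batch_size / subset_size so the expected update matches the full-batch residual (see add_residual above).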
+ + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + if not XFORMERS_AVAILABLE: + raise AssertionError("xFormers is required for using nested tensors") + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/hugging_face/cloud_adapter/dino_layers/dino_head.py b/hugging_face/cloud_adapter/dino_layers/dino_head.py new file mode 100644 index 0000000..0ace8ff --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/dino_head.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from torch.nn.utils import weight_norm + + +class DINOHead(nn.Module): + def __init__( + self, + in_dim, + out_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256, + mlp_bias=True, + ): + super().__init__() + nlayers = max(nlayers, 1) + self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) + self.apply(self._init_weights) + self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + eps = 1e-6 if x.dtype == torch.float16 else 1e-12 + x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) + x = self.last_layer(x) + return x + + +def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): + if nlayers == 1: + return nn.Linear(in_dim, bottleneck_dim, bias=bias) + else: + layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) + return nn.Sequential(*layers) diff --git a/hugging_face/cloud_adapter/dino_layers/drop_path.py b/hugging_face/cloud_adapter/dino_layers/drop_path.py new file mode 100644 index 0000000..1d640e0 --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/drop_path.py @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
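The DINOHead added above compresses backbone features through a small MLP, L2-normalizes the bottleneck, and projects onto the output prototypes with a weight-normalized linear layer. A minimal sketch of how it could be exercised; the sizes and import path below are illustrative assumptions, not values fixed by this diff:

```python
import torch
from cloud_adapter.dino_layers.dino_head import DINOHead  # path assumed from this diff

# Hypothetical sizes: 768-d features projected onto 65536 prototypes.
head = DINOHead(in_dim=768, out_dim=65536, hidden_dim=2048, bottleneck_dim=256)
feats = torch.randn(4, 768)   # e.g. a batch of CLS-token embeddings
logits = head(feats)          # MLP -> L2 normalize -> weight-normed linear
print(logits.shape)           # torch.Size([4, 65536])
```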
+# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/hugging_face/cloud_adapter/dino_layers/layer_scale.py b/hugging_face/cloud_adapter/dino_layers/layer_scale.py new file mode 100644 index 0000000..51df0d7 --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/layer_scale.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/hugging_face/cloud_adapter/dino_layers/mlp.py b/hugging_face/cloud_adapter/dino_layers/mlp.py new file mode 100644 index 0000000..bbf9432 --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/mlp.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
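DropPath and LayerScale above are the two per-branch regularizers that the transformer blocks combine around each residual update, roughly x + drop_path(layer_scale(branch(x))). A small illustrative sketch; the import paths and dimensions are assumptions:

```python
import torch
from torch import nn
from cloud_adapter.dino_layers.drop_path import DropPath      # paths assumed from this diff
from cloud_adapter.dino_layers.layer_scale import LayerScale

dim = 768
branch = nn.Linear(dim, dim)            # stand-in for an attention or MLP branch
ls = LayerScale(dim, init_values=1e-5)  # learnable per-channel scale, initialized near zero
dp = DropPath(drop_prob=0.1)            # zeroes the whole branch for ~10% of samples

x = torch.randn(2, 16, dim)
out = x + dp(ls(branch(x)))             # typical residual update inside a block
print(out.shape)                        # torch.Size([2, 16, 768])
```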
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/hugging_face/cloud_adapter/dino_layers/patch_embed.py b/hugging_face/cloud_adapter/dino_layers/patch_embed.py new file mode 100644 index 0000000..8b7c080 --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/patch_embed.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
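+        flatten_embedding: If False, return (B, H', W', D) feature maps instead of flattened (B, N, D) tokens.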
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/hugging_face/cloud_adapter/dino_layers/swiglu_ffn.py b/hugging_face/cloud_adapter/dino_layers/swiglu_ffn.py new file mode 100644 index 0000000..5e9dafa --- /dev/null +++ b/hugging_face/cloud_adapter/dino_layers/swiglu_ffn.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
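As a quick check of the PatchEmbed contract above, (B, C, H, W) -> (B, N, D) with N = (H / p) * (W / p), here is a minimal sketch; the sizes and import path are illustrative assumptions:

```python
import torch
from cloud_adapter.dino_layers.patch_embed import PatchEmbed  # path assumed from this diff

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
img = torch.randn(2, 3, 224, 224)
tokens = embed(img)
print(tokens.shape)  # torch.Size([2, 196, 768]); 196 = (224 // 16) ** 2
# With flatten_embedding=False the same input yields a (2, 14, 14, 768) feature map.
```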
+ +import os +from typing import Callable, Optional +import warnings + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None +try: + if XFORMERS_ENABLED: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True + warnings.warn("xFormers is available (SwiGLU)") + else: + warnings.warn("xFormers is disabled (SwiGLU)") + raise ImportError +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + warnings.warn("xFormers is not available (SwiGLU)") + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/hugging_face/cloud_adapter/dino_v2.py b/hugging_face/cloud_adapter/dino_v2.py new file mode 100644 index 0000000..59ea195 --- /dev/null +++ b/hugging_face/cloud_adapter/dino_v2.py @@ -0,0 +1,353 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
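SwiGLUFFNFused above shrinks the requested hidden width by 2/3 (keeping the parameter count close to a plain MLP of the same ratio) and rounds it up to a multiple of 8, then relies on xformers' fused SwiGLU when available and on the pure-PyTorch SwiGLUFFN otherwise. A small sketch with assumed sizes:

```python
import torch
from cloud_adapter.dino_layers.swiglu_ffn import SwiGLUFFNFused  # path assumed from this diff

ffn = SwiGLUFFNFused(in_features=384, hidden_features=4 * 384)
x = torch.randn(2, 10, 384)
print(ffn(x).shape)  # torch.Size([2, 10, 384])
# Internally the hidden width becomes (int(1536 * 2 / 3) + 7) // 8 * 8 = 1024.
```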
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from mmseg.models.builder import BACKBONES +from mmengine.model import BaseModule +import torch.nn.functional as F +from .dino_layers import ( + Mlp, + PatchEmbed, + SwiGLUFFNFused, + MemEffAttention, + NestedTensorBlock as Block, +) + + +def named_apply( + fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False +) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply( + fn=fn, + module=child_module, + name=child_name, + depth_first=depth_first, + include_root=True, + ) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +@BACKBONES.register_module() +class DinoVisionTransformer(BaseModule): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=partial(Block, attn_class=MemEffAttention), + ffn_layer="mlp", + block_chunks=1, + out_indices=[7, 11, 15, 23], + init_cfg=None, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + """ + super().__init__(init_cfg) + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.out_indices = out_indices + self.drop_path_rate = drop_path_rate + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.norm_layer = norm_layer + self.patch_size = patch_size + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim) + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [ 
+ x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + if ffn_layer == "mlp": + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append( + [nn.Identity()] * i + blocks_list[i : i + chunksize] + ) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape( + 1, int(math.sqrt(N)), int(math.sqrt(N)), dim + ).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode="bicubic", + ) + + assert ( + int(w0) == patch_pos_embed.shape[-2] + and int(h0) == patch_pos_embed.shape[-1] + ) + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to( + previous_dtype + ) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where( + masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x + ) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [ + self.prepare_tokens_with_masks(x, masks) + for x, masks in zip(x_list, masks_list) + ] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_patchtokens": x_norm[:, 1:], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + B, _, h, w = x.shape + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + outs = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + if idx in self.out_indices: + outs.append( + x[:, 1:, :] + .permute(0, 2, 1) + .reshape(B, -1, h // self.patch_size, w // self.patch_size) + .contiguous() + ) + return outs + + def 
_get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = ( + range(total_block_len - n, total_block_len) if isinstance(n, int) else n + ) + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len( + blocks_to_take + ), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. If it's a list, take them + blocks_to_take = ( + range(total_block_len - n, total_block_len) if isinstance(n, int) else n + ) + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len( + blocks_to_take + ), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1) + .permute(0, 3, 1, 2) + .contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, **kwargs): + ret = self.forward_features(*args, **kwargs) + if isinstance(ret[0], torch.Tensor): + ret[0] = F.interpolate( + ret[0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[1] = F.interpolate( + ret[1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[3] = F.interpolate( + ret[3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + else: + ret[0][0] = F.interpolate( + ret[0][0], scale_factor=4, mode="bilinear", align_corners=False + ) + ret[0][1] = F.interpolate( + ret[0][1], scale_factor=2, mode="bilinear", align_corners=False + ) + ret[0][3] = F.interpolate( + ret[0][3], scale_factor=0.5, mode="bilinear", align_corners=False + ) + return ret \ No newline at end of file diff --git a/hugging_face/cloud_adapter/hrcloudnet.py b/hugging_face/cloud_adapter/hrcloudnet.py new file mode 100644 index 0000000..fbb49a3 --- /dev/null +++ b/hugging_face/cloud_adapter/hrcloudnet.py @@ -0,0 +1,751 @@ +# 论文地址:https://arxiv.org/abs/2407.07365 +# +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import os + +import numpy as np +import torch +import torch._utils +import torch.nn as nn +import torch.nn.functional as F + +BatchNorm2d = nn.BatchNorm2d +# BN_MOMENTUM = 0.01 +relu_inplace = True +BN_MOMENTUM = 0.1 +ALIGN_CORNERS = True + +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, 
stride=stride, + padding=1, bias=False) + + +from yacs.config import CfgNode as CN +import math +from einops import rearrange + +# configs for HRNet48 +HRNET_48 = CN() +HRNET_48.FINAL_CONV_KERNEL = 1 + +HRNET_48.STAGE1 = CN() +HRNET_48.STAGE1.NUM_MODULES = 1 +HRNET_48.STAGE1.NUM_BRANCHES = 1 +HRNET_48.STAGE1.NUM_BLOCKS = [4] +HRNET_48.STAGE1.NUM_CHANNELS = [64] +HRNET_48.STAGE1.BLOCK = 'BOTTLENECK' +HRNET_48.STAGE1.FUSE_METHOD = 'SUM' + +HRNET_48.STAGE2 = CN() +HRNET_48.STAGE2.NUM_MODULES = 1 +HRNET_48.STAGE2.NUM_BRANCHES = 2 +HRNET_48.STAGE2.NUM_BLOCKS = [4, 4] +HRNET_48.STAGE2.NUM_CHANNELS = [48, 96] +HRNET_48.STAGE2.BLOCK = 'BASIC' +HRNET_48.STAGE2.FUSE_METHOD = 'SUM' + +HRNET_48.STAGE3 = CN() +HRNET_48.STAGE3.NUM_MODULES = 4 +HRNET_48.STAGE3.NUM_BRANCHES = 3 +HRNET_48.STAGE3.NUM_BLOCKS = [4, 4, 4] +HRNET_48.STAGE3.NUM_CHANNELS = [48, 96, 192] +HRNET_48.STAGE3.BLOCK = 'BASIC' +HRNET_48.STAGE3.FUSE_METHOD = 'SUM' + +HRNET_48.STAGE4 = CN() +HRNET_48.STAGE4.NUM_MODULES = 3 +HRNET_48.STAGE4.NUM_BRANCHES = 4 +HRNET_48.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] +HRNET_48.STAGE4.NUM_CHANNELS = [48, 96, 192, 384] +HRNET_48.STAGE4.BLOCK = 'BASIC' +HRNET_48.STAGE4.FUSE_METHOD = 'SUM' + +HRNET_32 = CN() +HRNET_32.FINAL_CONV_KERNEL = 1 + +HRNET_32.STAGE1 = CN() +HRNET_32.STAGE1.NUM_MODULES = 1 +HRNET_32.STAGE1.NUM_BRANCHES = 1 +HRNET_32.STAGE1.NUM_BLOCKS = [4] +HRNET_32.STAGE1.NUM_CHANNELS = [64] +HRNET_32.STAGE1.BLOCK = 'BOTTLENECK' +HRNET_32.STAGE1.FUSE_METHOD = 'SUM' + +HRNET_32.STAGE2 = CN() +HRNET_32.STAGE2.NUM_MODULES = 1 +HRNET_32.STAGE2.NUM_BRANCHES = 2 +HRNET_32.STAGE2.NUM_BLOCKS = [4, 4] +HRNET_32.STAGE2.NUM_CHANNELS = [32, 64] +HRNET_32.STAGE2.BLOCK = 'BASIC' +HRNET_32.STAGE2.FUSE_METHOD = 'SUM' + +HRNET_32.STAGE3 = CN() +HRNET_32.STAGE3.NUM_MODULES = 4 +HRNET_32.STAGE3.NUM_BRANCHES = 3 +HRNET_32.STAGE3.NUM_BLOCKS = [4, 4, 4] +HRNET_32.STAGE3.NUM_CHANNELS = [32, 64, 128] +HRNET_32.STAGE3.BLOCK = 'BASIC' +HRNET_32.STAGE3.FUSE_METHOD = 'SUM' + +HRNET_32.STAGE4 = CN() +HRNET_32.STAGE4.NUM_MODULES = 3 +HRNET_32.STAGE4.NUM_BRANCHES = 4 +HRNET_32.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] +HRNET_32.STAGE4.NUM_CHANNELS = [32, 64, 128, 256] +HRNET_32.STAGE4.BLOCK = 'BASIC' +HRNET_32.STAGE4.FUSE_METHOD = 'SUM' + +HRNET_18 = CN() +HRNET_18.FINAL_CONV_KERNEL = 1 + +HRNET_18.STAGE1 = CN() +HRNET_18.STAGE1.NUM_MODULES = 1 +HRNET_18.STAGE1.NUM_BRANCHES = 1 +HRNET_18.STAGE1.NUM_BLOCKS = [4] +HRNET_18.STAGE1.NUM_CHANNELS = [64] +HRNET_18.STAGE1.BLOCK = 'BOTTLENECK' +HRNET_18.STAGE1.FUSE_METHOD = 'SUM' + +HRNET_18.STAGE2 = CN() +HRNET_18.STAGE2.NUM_MODULES = 1 +HRNET_18.STAGE2.NUM_BRANCHES = 2 +HRNET_18.STAGE2.NUM_BLOCKS = [4, 4] +HRNET_18.STAGE2.NUM_CHANNELS = [18, 36] +HRNET_18.STAGE2.BLOCK = 'BASIC' +HRNET_18.STAGE2.FUSE_METHOD = 'SUM' + +HRNET_18.STAGE3 = CN() +HRNET_18.STAGE3.NUM_MODULES = 4 +HRNET_18.STAGE3.NUM_BRANCHES = 3 +HRNET_18.STAGE3.NUM_BLOCKS = [4, 4, 4] +HRNET_18.STAGE3.NUM_CHANNELS = [18, 36, 72] +HRNET_18.STAGE3.BLOCK = 'BASIC' +HRNET_18.STAGE3.FUSE_METHOD = 'SUM' + +HRNET_18.STAGE4 = CN() +HRNET_18.STAGE4.NUM_MODULES = 3 +HRNET_18.STAGE4.NUM_BRANCHES = 4 +HRNET_18.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] +HRNET_18.STAGE4.NUM_CHANNELS = [18, 36, 72, 144] +HRNET_18.STAGE4.BLOCK = 'BASIC' +HRNET_18.STAGE4.FUSE_METHOD = 'SUM' + + +class PPM(nn.Module): + def __init__(self, in_dim, reduction_dim, bins): + super(PPM, self).__init__() + self.features = [] + for bin in bins: + self.features.append(nn.Sequential( + nn.AdaptiveAvgPool2d(bin), + nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False), + 
nn.BatchNorm2d(reduction_dim), + nn.ReLU(inplace=True) + )) + self.features = nn.ModuleList(self.features) + + def forward(self, x): + x_size = x.size() + out = [x] + for f in self.features: + out.append(F.interpolate(f(x), x_size[2:], mode='bilinear', align_corners=True)) + return torch.cat(out, 1) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=relu_inplace) + self.conv2 = conv3x3(planes, planes) + self.bn2 = BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out = out + residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=relu_inplace) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + # att = self.downsample(att) + out = out + residual + out = self.relu(out) + + return out + + +class HighResolutionModule(nn.Module): + def __init__(self, num_branches, blocks, num_blocks, num_inchannels, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_inchannels, num_channels) + + self.num_inchannels = num_inchannels + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=relu_inplace) + + def _check_branches(self, num_branches, blocks, num_blocks, + num_inchannels, num_channels): + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( + num_branches, len(num_blocks)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( + num_branches, len(num_channels)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_inchannels): + error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( + num_branches, len(num_inchannels)) + logger.error(error_msg) + raise ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, + stride=1): + downsample = None 
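+        # A 1x1 projection shortcut is only built when identity addition is impossible,
+        # i.e. when the stride changes resolution or the branch input width differs
+        # from num_channels * block.expansion.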
+ if stride != 1 or \ + self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.num_inchannels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False), + BatchNorm2d(num_channels[branch_index] * block.expansion, + momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.num_inchannels[branch_index], + num_channels[branch_index], stride, downsample)) + self.num_inchannels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append(block(self.num_inchannels[branch_index], + num_channels[branch_index])) + + return nn.Sequential(*layers) + + # 创建平行层 + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + num_branches = self.num_branches # 3 + num_inchannels = self.num_inchannels # [48, 96, 192] + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append(nn.Sequential( + nn.Conv2d(num_inchannels[j], + num_inchannels[i], + 1, + 1, + 0, + bias=False), + BatchNorm2d(num_inchannels[i], momentum=BN_MOMENTUM))) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i - j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + conv3x3s.append(nn.Sequential( + nn.Conv2d(num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False), + BatchNorm2d(num_outchannels_conv3x3, + momentum=BN_MOMENTUM))) + else: + num_outchannels_conv3x3 = num_inchannels[j] + conv3x3s.append(nn.Sequential( + nn.Conv2d(num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False), + BatchNorm2d(num_outchannels_conv3x3, + momentum=BN_MOMENTUM), + nn.ReLU(inplace=relu_inplace))) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + return self.num_inchannels + + def forward(self, x): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + elif j > i: + width_output = x[i].shape[-1] + height_output = x[i].shape[-2] + y = y + F.interpolate( + self.fuse_layers[i][j](x[j]), + size=[height_output, width_output], + mode='bilinear', align_corners=ALIGN_CORNERS) + else: + y = y + self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class HRCloudNet(nn.Module): + + def __init__(self, in_channels=3,num_classes=2, base_c=48, **kwargs): + global ALIGN_CORNERS + extra = HRNET_48 + super(HRCloudNet, self).__init__() + ALIGN_CORNERS = True + # ALIGN_CORNERS = config.MODEL.ALIGN_CORNERS + self.num_classes = num_classes + # stem net + self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn1 = BatchNorm2d(64, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, + bias=False) + 
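+        # Stem: two stride-2 3x3 convs bring the input to 1/4 resolution with 64 channels
+        # before the four HRNet stages constructed below.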
self.bn2 = BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=relu_inplace) + + self.stage1_cfg = extra['STAGE1'] + num_channels = self.stage1_cfg['NUM_CHANNELS'][0] + block = blocks_dict[self.stage1_cfg['BLOCK']] + num_blocks = self.stage1_cfg['NUM_BLOCKS'][0] + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + stage1_out_channel = block.expansion * num_channels + + self.stage2_cfg = extra['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition1 = self._make_transition_layer( + [stage1_out_channel], num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + self.stage3_cfg = extra['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition2 = self._make_transition_layer( + pre_stage_channels, num_channels) # 只在pre[-1]与cur[-1]之间下采样? + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + self.stage4_cfg = extra['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition3 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=True) + self.out_conv = OutConv(base_c, num_classes) + last_inp_channels = int(np.sum(pre_stage_channels)) + + self.corr = Corr(nclass=2) + self.proj = nn.Sequential( + # 512 32 + nn.Conv2d(720, 48, kernel_size=3, stride=1, padding=1, bias=True), + nn.BatchNorm2d(48), + nn.ReLU(inplace=True), + nn.Dropout2d(0.1), + ) + # self.up1 = Up(base_c * 16, base_c * 8 // factor, bilinear) + self.up2 = Up(base_c * 8, base_c * 4, True) + self.up3 = Up(base_c * 4, base_c * 2, True) + self.up4 = Up(base_c * 2, base_c, True) + fea_dim = 720 + bins = (1, 2, 3, 6) + self.ppm = PPM(fea_dim, int(fea_dim / len(bins)), bins) + fea_dim *= 2 + self.cls = nn.Sequential( + nn.Conv2d(fea_dim, 512, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(512), + nn.ReLU(inplace=True), + nn.Dropout2d(p=0.1), + nn.Conv2d(512, num_classes, kernel_size=1) + ) + + ''' + 转换层的作用有两种情况: + + 当前分支数小于之前分支数时,仅对前几个分支进行通道数调整。 + 当前分支数大于之前分支数时,新建一些转换层,对多余的分支进行下采样,改变通道数以适应后续的连接。 + 最终,这些转换层会被组合成一个 nn.ModuleList 对象,并在网络的构建过程中使用。 + 这有助于确保每个分支的通道数在不同阶段之间能够正确匹配,以便进行特征的融合和连接 + ''' + + def _make_transition_layer( + self, num_channels_pre_layer, num_channels_cur_layer): + # 现在的分支数 + num_branches_cur = len(num_channels_cur_layer) # 3 + # 处理前的分支数 + num_branches_pre = len(num_channels_pre_layer) # 2 + + transition_layers = [] + for i in range(num_branches_cur): + # 如果当前分支数小于之前分支数,仅针对第一到第二阶段 + if i < num_branches_pre: + # 如果对应层的通道数不一致,则进行转化( + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append(nn.Sequential( + + nn.Conv2d(num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, + 1, + 1, + bias=False), + BatchNorm2d( + num_channels_cur_layer[i], momentum=BN_MOMENTUM), + nn.ReLU(inplace=relu_inplace))) + else: + transition_layers.append(None) + else: # 在新建层下采样改变通道数 + conv3x3s = [] + for j in range(i + 1 - num_branches_pre): # 3 + inchannels = num_channels_pre_layer[-1] + outchannels = 
num_channels_cur_layer[i] \ + if j == i - num_branches_pre else inchannels + conv3x3s.append(nn.Sequential( + nn.Conv2d( + inchannels, outchannels, 3, 2, 1, bias=False), + BatchNorm2d(outchannels, momentum=BN_MOMENTUM), + nn.ReLU(inplace=relu_inplace))) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + ''' + _make_layer 函数的主要作用是创建一个由多个相同类型的残差块(Residual Block)组成的层。 + ''' + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + # 多尺度融合 + def _make_stage(self, layer_config, num_inchannels, + multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): # 重复4次 + # multi_scale_output is only used last module + if not multi_scale_output and i == num_modules - 1: + reset_multi_scale_output = False + else: + reset_multi_scale_output = True + modules.append( + HighResolutionModule(num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + fuse_method, + reset_multi_scale_output) + ) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels + + def forward(self, input, need_fp=True, use_corr=True): + # from ipdb import set_trace + # set_trace() + x = self.conv1(input) + x = self.bn1(x) + x = self.relu(x) + # x_176 = x + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['NUM_BRANCHES']): # 2 + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + # Y1 + x_list = [] + for i in range(self.stage3_cfg['NUM_BRANCHES']): + if self.transition2[i] is not None: + if i < self.stage2_cfg['NUM_BRANCHES']: + x_list.append(self.transition2[i](y_list[i])) + else: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['NUM_BRANCHES']): + if self.transition3[i] is not None: + if i < self.stage3_cfg['NUM_BRANCHES']: + x_list.append(self.transition3[i](y_list[i])) + else: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + x = self.stage4(x_list) + dict_return = {} + # Upsampling + x0_h, x0_w = x[0].size(2), x[0].size(3) + + x3 = F.interpolate(x[3], size=(x0_h, x0_w), mode='bilinear', align_corners=ALIGN_CORNERS) + # x = self.stage3_(x) + x[2] = self.up2(x[3], x[2]) + x2 = F.interpolate(x[2], size=(x0_h, x0_w), mode='bilinear', align_corners=ALIGN_CORNERS) + # x = self.stage2_(x) + x[1] = self.up3(x[2], x[1]) + x1 = F.interpolate(x[1], size=(x0_h, x0_w), mode='bilinear', align_corners=ALIGN_CORNERS) + x[0] = self.up4(x[1], x[0]) + xk = torch.cat([x[0], x1, x2, x3], 1) + # PPM + feat = self.ppm(xk) + x = self.cls(feat) + # fp分支 + if need_fp: + logits 
= F.interpolate(x, size=input.size()[2:], mode='bilinear', align_corners=True) + # logits = self.out_conv(torch.cat((x, nn.Dropout2d(0.5)(x)))) + out = logits + out_fp = logits + if use_corr: + proj_feats = self.proj(xk) + corr_out = self.corr(proj_feats, out) + corr_out = F.interpolate(corr_out, size=(352, 352), mode="bilinear", align_corners=True) + dict_return['corr_out'] = corr_out + dict_return['out'] = out + dict_return['out_fp'] = out_fp + + return dict_return['out'] + + out = F.interpolate(x, size=input.size()[2:], mode='bilinear', align_corners=True) + if use_corr: # True + proj_feats = self.proj(xk) + # 计算 + corr_out = self.corr(proj_feats, out) + corr_out = F.interpolate(corr_out, size=(352, 352), mode="bilinear", align_corners=True) + dict_return['corr_out'] = corr_out + dict_return['out'] = out + return dict_return['out'] + # return x + + def init_weights(self, pretrained='', ): + logger.info('=> init weights from normal distribution') + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + if os.path.isfile(pretrained): + pretrained_dict = torch.load(pretrained) + logger.info('=> loading pretrained model {}'.format(pretrained)) + model_dict = self.state_dict() + pretrained_dict = {k: v for k, v in pretrained_dict.items() + if k in model_dict.keys()} + for k, _ in pretrained_dict.items(): + logger.info( + '=> loading {} pretrained model {}'.format(k, pretrained)) + model_dict.update(pretrained_dict) + self.load_state_dict(model_dict) + + +class OutConv(nn.Sequential): + def __init__(self, in_channels, num_classes): + super(OutConv, self).__init__( + nn.Conv2d(720, num_classes, kernel_size=1) + ) + + +class DoubleConv(nn.Sequential): + def __init__(self, in_channels, out_channels, mid_channels=None): + if mid_channels is None: + mid_channels = out_channels + super(DoubleConv, self).__init__( + nn.Conv2d(in_channels + out_channels, mid_channels, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(mid_channels), + nn.ReLU(inplace=True), + nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True) + ) + + +class Up(nn.Module): + def __init__(self, in_channels, out_channels, bilinear=True): + super(Up, self).__init__() + if bilinear: + self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) + self.conv = DoubleConv(in_channels, out_channels, in_channels // 2) + else: + self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2) + self.conv = DoubleConv(in_channels, out_channels) + + def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + x1 = self.up(x1) + # [N, C, H, W] + diff_y = x2.size()[2] - x1.size()[2] + diff_x = x2.size()[3] - x1.size()[3] + + # padding_left, padding_right, padding_top, padding_bottom + x1 = F.pad(x1, [diff_x // 2, diff_x - diff_x // 2, + diff_y // 2, diff_y - diff_y // 2]) + + x = torch.cat([x2, x1], dim=1) + x = self.conv(x) + return x + + +class Corr(nn.Module): + def __init__(self, nclass=2): + super(Corr, self).__init__() + self.nclass = nclass + self.conv1 = nn.Conv2d(48, self.nclass, kernel_size=1, stride=1, padding=0, bias=True) + self.conv2 = nn.Conv2d(48, self.nclass, kernel_size=1, stride=1, padding=0, bias=True) + + def forward(self, feature_in, out): + # in torch.Size([4, 32, 22, 22]) + # out = [4 2 352 352] + h_in, w_in = math.ceil(feature_in.shape[2] / (1)), 
math.ceil(feature_in.shape[3] / (1)) + out = F.interpolate(out.detach(), (h_in, w_in), mode='bilinear', align_corners=True) + feature = F.interpolate(feature_in, (h_in, w_in), mode='bilinear', align_corners=True) + f1 = rearrange(self.conv1(feature), 'n c h w -> n c (h w)') + f2 = rearrange(self.conv2(feature), 'n c h w -> n c (h w)') + out_temp = rearrange(out, 'n c h w -> n c (h w)') + corr_map = torch.matmul(f1.transpose(1, 2), f2) / torch.sqrt(torch.tensor(f1.shape[1]).float()) + corr_map = F.softmax(corr_map, dim=-1) + # out_temp 2 2 484 + # corr_map 4 484 484 + out = rearrange(torch.matmul(out_temp, corr_map), 'n c (h w) -> n c h w', h=h_in, w=w_in) + # out torch.Size([4, 2, 22, 22]) + return out + + +if __name__ == '__main__': + input = torch.randn(4, 3, 352, 352) + cloud = HRCloudNet(num_classes=2) + output = cloud(input) + print(output.shape) + # torch.Size([4, 2, 352, 352]) torch.Size([4, 2, 352, 352]) torch.Size([4, 2, 352, 352]) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/kappamask.py b/hugging_face/cloud_adapter/kappamask.py new file mode 100644 index 0000000..57072c5 --- /dev/null +++ b/hugging_face/cloud_adapter/kappamask.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/8/7 下午3:51 +# @Author : xiaoshun +# @Email : 3038523973@qq.com +# @File : kappamask.py.py +# @Software: PyCharm + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +class KappaMask(nn.Module): + def __init__(self, num_classes=2, in_channels=3): + super().__init__() + self.conv1 = nn.Sequential( + nn.Conv2d(in_channels, 64, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(64, 64, 3, 1, 1), + nn.ReLU(inplace=True), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(64, 128, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, 1), + nn.ReLU(inplace=True), + ) + self.conv3 = nn.Sequential( + nn.Conv2d(128, 256, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, 1), + nn.ReLU(inplace=True), + ) + + self.conv4 = nn.Sequential( + nn.Conv2d(256, 512, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, 1), + nn.ReLU(inplace=True), + ) + self.drop4 = nn.Dropout(0.5) + + self.conv5 = nn.Sequential( + nn.Conv2d(512, 1024, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 3, 1, 1), + nn.ReLU(inplace=True), + ) + self.drop5 = nn.Dropout(0.5) + + self.up6 = nn.Sequential( + nn.Upsample(scale_factor=2), + nn.ZeroPad2d((0, 1, 0, 1)), + nn.Conv2d(1024, 512, 2), + nn.ReLU(inplace=True) + ) + self.conv6 = nn.Sequential( + nn.Conv2d(1024, 512, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, 1), + nn.ReLU(inplace=True), + ) + self.up7 = nn.Sequential( + nn.Upsample(scale_factor=2), + nn.ZeroPad2d((0, 1, 0, 1)), + nn.Conv2d(512, 256, 2), + nn.ReLU(inplace=True) + ) + self.conv7 = nn.Sequential( + nn.Conv2d(512, 256, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, 1), + nn.ReLU(inplace=True), + ) + + self.up8 = nn.Sequential( + nn.Upsample(scale_factor=2), + nn.ZeroPad2d((0, 1, 0, 1)), + nn.Conv2d(256, 128, 2), + nn.ReLU(inplace=True) + ) + self.conv8 = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, 1), + nn.ReLU(inplace=True), + ) + + self.up9 = nn.Sequential( + nn.Upsample(scale_factor=2), + nn.ZeroPad2d((0, 1, 0, 1)), + nn.Conv2d(128, 64, 2), + nn.ReLU(inplace=True) + ) + self.conv9 = nn.Sequential( + nn.Conv2d(128, 64, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(64, 64, 3, 1, 1), + nn.ReLU(inplace=True), + nn.Conv2d(64, 2, 3, 1, 1), + 
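+            # squeeze to 2 channels before the final 1x1 classifier (self.conv10)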
nn.ReLU(inplace=True), + ) + self.conv10 = nn.Conv2d(2, num_classes, 1) + self.__init_weights() + + def __init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + + def forward(self, x): + conv1 = self.conv1(x) + pool1 = F.max_pool2d(conv1, 2, 2) + + conv2 = self.conv2(pool1) + pool2 = F.max_pool2d(conv2, 2, 2) + + conv3 = self.conv3(pool2) + pool3 = F.max_pool2d(conv3, 2, 2) + + conv4 = self.conv4(pool3) + drop4 = self.drop4(conv4) + pool4 = F.max_pool2d(drop4, 2, 2) + + conv5 = self.conv5(pool4) + drop5 = self.drop5(conv5) + + up6 = self.up6(drop5) + merge6 = torch.cat((drop4, up6), dim=1) + conv6 = self.conv6(merge6) + + up7 = self.up7(conv6) + merge7 = torch.cat((conv3, up7), dim=1) + conv7 = self.conv7(merge7) + + up8 = self.up8(conv7) + merge8 = torch.cat((conv2, up8), dim=1) + conv8 = self.conv8(merge8) + + up9 = self.up9(conv8) + merge9 = torch.cat((conv1, up9), dim=1) + conv9 = self.conv9(merge9) + + output = self.conv10(conv9) + return output + + +if __name__ == '__main__': + model = KappaMask(num_classes=2, in_channels=3) + fake_data = torch.rand(2, 3, 256, 256) + output = model(fake_data) + print(output.shape) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/mcdnet.py b/hugging_face/cloud_adapter/mcdnet.py new file mode 100644 index 0000000..93a1e0f --- /dev/null +++ b/hugging_face/cloud_adapter/mcdnet.py @@ -0,0 +1,435 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/7/21 下午3:51 +# @Author : xiaoshun +# @Email : 3038523973@qq.com +# @File : mcdnet.py +# @Software: PyCharm +import image_dehazer +import numpy as np +# 论文地址:https://www.sciencedirect.com/science/article/pii/S1569843224001742?via%3Dihub +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class _DPFF(nn.Module): + def __init__(self, in_channels) -> None: + super(_DPFF, self).__init__() + self.cbr1 = nn.Conv2d(in_channels * 2, in_channels, 1, 1, bias=False) + self.cbr2 = nn.Conv2d(in_channels * 2, in_channels, 1, 1, bias=False) + # self.sigmoid = nn.Sigmoid() + self.cbr3 = nn.Conv2d(in_channels, in_channels, 1, 1, bias=False) + self.cbr4 = nn.Conv2d(in_channels * 2, in_channels, 1, 1, bias=False) + + def forward(self, feature1, feature2): + d1 = torch.abs(feature1 - feature2) + d2 = self.cbr1(torch.cat([feature1, feature2], dim=1)) + d = torch.cat([d1, d2], dim=1) + d = self.cbr2(d) + # d = self.sigmoid(d) + + v1, v2 = self.cbr3(feature1), self.cbr3(feature2) + v1, v2 = v1 * d, v2 * d + features = torch.cat([v1, v2], dim=1) + features = self.cbr4(features) + + return features + + +class DPFF(nn.Module): + def __init__(self, layer_channels) -> None: + super(DPFF, self).__init__() + self.cfes = nn.ModuleList() + for layer_channel in layer_channels: + self.cfes.append(_DPFF(layer_channel)) + + def forward(self, features1, features2): + outputs = [] + for feature1, feature2, cfe in zip(features1, features2, self.cfes): + outputs.append(cfe(feature1, feature2)) + return outputs + + +class DirectDPFF(nn.Module): + def __init__(self, layer_channels) -> None: + super(DirectDPFF, self).__init__() + self.fusions = nn.ModuleList( + [nn.Conv2d(layer_channel * 2, layer_channel, 1, 1) for layer_channel in layer_channels] + ) + + def forward(self, features1, features2): + outputs = [] + for feature1, feature2, fusion in zip(features1, features2, self.fusions): + feature = torch.cat([feature1, feature2], dim=1) + outputs.append(fusion(feature)) + return outputs + + +class ConvBlock(nn.Module): + def 
__init__(self, input_size, output_size, kernel_size=4, stride=2, padding=1, bias=True, + bn=False, activation=True, maxpool=True): + super(ConvBlock, self).__init__() + self.module = [] + if maxpool: + down = nn.Sequential( + *[ + nn.MaxPool2d(2), + nn.Conv2d(input_size, output_size, 1, 1, 0, bias=bias) + ] + ) + else: + down = nn.Conv2d(input_size, output_size, kernel_size, stride, padding, bias=bias) + self.module.append(down) + if bn: + self.module.append(nn.BatchNorm2d(output_size)) + if activation: + self.module.append(nn.PReLU()) + self.module = nn.Sequential(*self.module) + + def forward(self, x): + out = self.module(x) + + return out + + +class DeconvBlock(nn.Module): + def __init__(self, input_size, output_size, kernel_size=4, stride=2, padding=1, bias=True, + bn=False, activation=True, bilinear=True): + super(DeconvBlock, self).__init__() + self.module = [] + if bilinear: + deconv = nn.Sequential( + *[ + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + nn.Conv2d(input_size, output_size, 1, 1, 0, bias=bias) + ] + ) + else: + deconv = nn.ConvTranspose2d(input_size, output_size, kernel_size, stride, padding, bias=bias) + self.module.append(deconv) + if bn: + self.module.append(nn.BatchNorm2d(output_size)) + if activation: + self.module.append(nn.PReLU()) + self.module = nn.Sequential(*self.module) + + def forward(self, x): + out = self.module(x) + + return out + + +class FusionBlock(torch.nn.Module): + def __init__(self, num_filter, num_ft, kernel_size=4, stride=2, padding=1, bias=True, maxpool=False, + bilinear=False): + super(FusionBlock, self).__init__() + self.num_ft = num_ft + self.up_convs = nn.ModuleList() + self.down_convs = nn.ModuleList() + for i in range(self.num_ft): + self.up_convs.append( + DeconvBlock(num_filter // (2 ** i), num_filter // (2 ** (i + 1)), kernel_size, stride, padding, + bias=bias, bilinear=bilinear) + ) + self.down_convs.append( + ConvBlock(num_filter // (2 ** (i + 1)), num_filter // (2 ** i), kernel_size, stride, padding, bias=bias, + maxpool=maxpool) + ) + + def forward(self, ft_l, ft_h_list): + ft_fusion = ft_l + for i in range(len(ft_h_list)): + ft = ft_fusion + for j in range(self.num_ft - i): + ft = self.up_convs[j](ft) + ft = ft - ft_h_list[i] + for j in range(self.num_ft - i): + ft = self.down_convs[self.num_ft - i - j - 1](ft) + ft_fusion = ft_fusion + ft + + return ft_fusion + + +class ConvLayer(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True): + super(ConvLayer, self).__init__() + reflection_padding = kernel_size // 2 + self.reflection_pad = nn.ReflectionPad2d(reflection_padding) + self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride, bias=bias) + + def forward(self, x): + out = self.reflection_pad(x) + out = self.conv2d(out) + return out + + +class UpsampleConvLayer(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride): + super(UpsampleConvLayer, self).__init__() + self.conv2d = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride=stride) + + def forward(self, x): + out = self.conv2d(x) + return out + + +class AddRelu(nn.Module): + """It is for adding two feed forwards to the output of the two following conv layers in expanding path + """ + + def __init__(self) -> None: + super(AddRelu, self).__init__() + self.relu = nn.PReLU() + + def forward(self, input_tensor1, input_tensor2, input_tensor3): + x = input_tensor1 + input_tensor2 + input_tensor3 + return self.relu(x) + + +class BasicBlock(nn.Module): + def 
__init__(self, in_channels, out_channels, mid_channels=None): + super(BasicBlock, self).__init__() + if not mid_channels: + mid_channels = out_channels + self.conv1 = ConvLayer(in_channels, mid_channels, kernel_size=3, stride=1) + self.bn1 = nn.BatchNorm2d(mid_channels, momentum=0.1) + self.relu = nn.PReLU() + + self.conv2 = ConvLayer(mid_channels, out_channels, kernel_size=3, stride=1) + self.bn2 = nn.BatchNorm2d(out_channels, momentum=0.1) + + self.conv3 = ConvLayer(in_channels, out_channels, kernel_size=1, stride=1) + + def forward(self, x): + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + residual = self.conv3(x) + + out = out + residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + def __init__(self, in_channels, out_channels): + super(Bottleneck, self).__init__() + self.conv1 = ConvLayer(in_channels, out_channels, kernel_size=3, stride=1) + self.bn1 = nn.BatchNorm2d(out_channels, momentum=0.1) + + self.conv2 = ConvLayer(out_channels, out_channels, kernel_size=3, stride=1) + self.bn2 = nn.BatchNorm2d(out_channels, momentum=0.1) + + self.conv3 = ConvLayer(out_channels, out_channels, kernel_size=3, stride=1) + self.bn3 = nn.BatchNorm2d(out_channels, momentum=0.1) + + self.conv4 = ConvLayer(in_channels, out_channels, kernel_size=1, stride=1) + + self.relu = nn.PReLU() + + def forward(self, x): + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + residual = self.conv4(x) + + out = out + residual + out = self.relu(out) + + return out + + +class PPM(nn.Module): + def __init__(self, in_channels, out_channels): + super(PPM, self).__init__() + + self.pool_sizes = [1, 2, 3, 6] # subregion size in each level + self.num_levels = len(self.pool_sizes) # number of pyramid levels + + self.conv_layers = nn.ModuleList() + for i in range(self.num_levels): + self.conv_layers.append(nn.Sequential( + nn.AdaptiveAvgPool2d(output_size=self.pool_sizes[i]), + nn.Conv2d(in_channels, in_channels // self.num_levels, kernel_size=1), + nn.BatchNorm2d(in_channels // self.num_levels), + nn.ReLU(inplace=True) + )) + self.out_conv = nn.Conv2d(in_channels * 2, out_channels, kernel_size=1, stride=1) + + def forward(self, x): + input_size = x.size()[2:] # get input size + output = [x] + + # pyramid pooling + for i in range(self.num_levels): + out = self.conv_layers[i](x) + out = F.interpolate(out, size=input_size, mode='bilinear', align_corners=True) + output.append(out) + + # concatenate features from different levels + output = torch.cat(output, dim=1) + output = self.out_conv(output) + + return output + + +class MCDNet(nn.Module): + def __init__(self, in_channels=4, num_classes=4, maxpool=False, bilinear=False) -> None: + super().__init__() + level = 1 + # encoder + self.conv_input = ConvLayer(in_channels, 32 * level, kernel_size=3, stride=2) + + self.dense0 = BasicBlock(32 * level, 32 * level) + self.conv2x = ConvLayer(32 * level, 64 * level, kernel_size=3, stride=2) + + self.dense1 = BasicBlock(64 * level, 64 * level) + self.conv4x = ConvLayer(64 * level, 128 * level, kernel_size=3, stride=2) + + self.dense2 = BasicBlock(128 * level, 128 * level) + self.conv8x = ConvLayer(128 * level, 256 * level, kernel_size=3, stride=2) + + self.dense3 = BasicBlock(256 * level, 256 * level) + self.conv16x = ConvLayer(256 * level, 512 * level, kernel_size=3, stride=2) + + self.dense4 = PPM(512 * level, 
512 * level) + + # dpff + self.dpffm = DPFF([32, 64, 128, 256, 512]) + + # decoder + self.convd16x = UpsampleConvLayer(512 * level, 256 * level, kernel_size=3, stride=2) + self.fusion4 = FusionBlock(256 * level, 3, maxpool=maxpool, bilinear=bilinear) + self.dense_4 = Bottleneck(512 * level, 256 * level) + self.add_block4 = AddRelu() + + self.convd8x = UpsampleConvLayer(256 * level, 128 * level, kernel_size=3, stride=2) + self.fusion3 = FusionBlock(128 * level, 2, maxpool=maxpool, bilinear=bilinear) + self.dense_3 = Bottleneck(256 * level, 128 * level) + self.add_block3 = AddRelu() + + self.convd4x = UpsampleConvLayer(128 * level, 64 * level, kernel_size=3, stride=2) + self.fusion2 = FusionBlock(64 * level, 1, maxpool=maxpool, bilinear=bilinear) + self.dense_2 = Bottleneck(128 * level, 64 * level) + self.add_block2 = AddRelu() + + self.convd2x = UpsampleConvLayer(64 * level, 32 * level, kernel_size=3, stride=2) + self.dense_1 = Bottleneck(64 * level, 32 * level) + self.add_block1 = AddRelu() + + self.head = UpsampleConvLayer(32 * level, num_classes, kernel_size=3, stride=2) + self.apply(self._weights_init) + + @torch.no_grad() + def get_lr_data(self, x: torch.Tensor) -> torch.Tensor: + images = x.cpu().permute(0, 2, 3, 1).numpy() # b, h, w, c + batch_size = images.shape[0] + lr = [] + for i in range(batch_size): + lr_image = image_dehazer.remove_haze((images[i]*255).astype(np.uint8), showHazeTransmissionMap=False)[0] # h, w, c, numpy.array + lr_tensor = torch.from_numpy(lr_image).permute(2, 0, 1)/255. # c, h, w + lr.append(lr_tensor) + return torch.stack(lr, dim=0).to(x.device) # b, c, h, w + + def _weights_init(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_normal_(m.weight) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x1): + x2 = self.get_lr_data(x1) + # encoder1 + res1x_1 = self.conv_input(x1) + res1x_1 = self.dense0(res1x_1) + + res2x_1 = self.conv2x(res1x_1) + res2x_1 = self.dense1(res2x_1) + + res4x_1 = self.conv4x(res2x_1) + res4x_1 = self.dense2(res4x_1) + + res8x_1 = self.conv8x(res4x_1) + res8x_1 = self.dense3(res8x_1) + + res16x_1 = self.conv16x(res8x_1) + res16x_1 = self.dense4(res16x_1) + + # encoder2 + res1x_2 = self.conv_input(x2) + res1x_2 = self.dense0(res1x_2) + + res2x_2 = self.conv2x(res1x_2) + res2x_2 = self.dense1(res2x_2) + + res4x_2 = self.conv4x(res2x_2) + res4x_2 = self.dense2(res4x_2) + + res8x_2 = self.conv8x(res4x_2) + res8x_2 = self.dense3(res8x_2) + + res16x_2 = self.conv16x(res8x_2) + res16x_2 = self.dense4(res16x_2) + + # dual-perspective feature fusion + res1x, res2x, res4x, res8x, res16x = self.dpffm( + [res1x_1, res2x_1, res4x_1, res8x_1, res16x_1], + [res1x_2, res2x_2, res4x_2, res8x_2, res16x_2] + ) + + # decoder + res8x1 = self.convd16x(res16x) + res8x1 = F.interpolate(res8x1, res8x.size()[2:], mode='bilinear') + res8x2 = self.fusion4(res8x, [res1x, res2x, res4x]) + res8x2 = torch.cat([res8x1, res8x2], dim=1) + res8x2 = self.dense_4(res8x2) + res8x2 = self.add_block4(res8x1, res8x, res8x2) + + res4x1 = self.convd8x(res8x2) + res4x1 = F.interpolate(res4x1, res4x.size()[2:], mode='bilinear') + res4x2 = self.fusion3(res4x, [res1x, res2x]) + res4x2 = torch.cat([res4x1, res4x2], dim=1) + res4x2 = self.dense_3(res4x2) + res4x2 = self.add_block3(res4x1, res4x, res4x2) + + res2x1 = self.convd4x(res4x2) + res2x1 = F.interpolate(res2x1, 
res2x.size()[2:], mode='bilinear') + res2x2 = self.fusion2(res2x, [res1x]) + res2x2 = torch.cat([res2x1, res2x2], dim=1) + res2x2 = self.dense_2(res2x2) + res2x2 = self.add_block2(res2x1, res2x, res2x2) + + res1x1 = self.convd2x(res2x2) + res1x1 = F.interpolate(res1x1, res1x.size()[2:], mode='bilinear') + res1x2 = torch.cat([res1x1, res1x], dim=1) + res1x2 = self.dense_1(res1x2) + res1x2 = self.add_block1(res1x1, res1x, res1x2) + + out = self.head(res1x2) + out = F.interpolate(out, x1.size()[2:], mode='bilinear') + + return out + + +if __name__ == "__main__": + num_classes = 2 + model = MCDNet() + # inp = torch.randn(size=(2, 3, 256, 256)) + # assert model(input).shape == (2, 2, 256, 256) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/scnn.py b/hugging_face/cloud_adapter/scnn.py new file mode 100644 index 0000000..b5bd5d6 --- /dev/null +++ b/hugging_face/cloud_adapter/scnn.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/7/21 下午5:11 +# @Author : xiaoshun +# @Email : 3038523973@qq.com +# @File : scnn.py +# @Software: PyCharm + +# 论文地址:https://www.sciencedirect.com/science/article/abs/pii/S0924271624000352?via%3Dihub#fn1 + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SCNN(nn.Module): + def __init__(self, in_channels=3, num_classes=2, dropout_p=0.5): + super().__init__() + self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=1) + self.conv2 = nn.Conv2d(64, num_classes, kernel_size=1) + self.conv3 = nn.Conv2d(num_classes, num_classes, kernel_size=3, padding=1) + self.dropout = nn.Dropout2d(p=dropout_p) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = self.dropout(x) + x = self.conv2(x) + x = self.conv3(x) + return x + + +if __name__ == '__main__': + model = SCNN(num_classes=7) + fake_img = torch.randn((2, 3, 224, 224)) + out = model(fake_img) + print(out.shape) + # torch.Size([2, 7, 224, 224]) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/unetmobv2.py b/hugging_face/cloud_adapter/unetmobv2.py new file mode 100644 index 0000000..3ae7824 --- /dev/null +++ b/hugging_face/cloud_adapter/unetmobv2.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/8/6 下午3:44 +# @Author : xiaoshun +# @Email : 3038523973@qq.com +# @File : unetmobv2.py +# @Software: PyCharm +import segmentation_models_pytorch as smp +import torch +from torch import nn as nn + + +class UNetMobV2(nn.Module): + def __init__(self,num_classes,in_channels=3): + super().__init__() + self.backbone = smp.Unet( + encoder_name='mobilenet_v2', + encoder_weights=None, + in_channels=in_channels, + classes=num_classes, + ) + + def forward(self, x): + x = self.backbone(x) + return x + + +if __name__ == '__main__': + fake_image = torch.rand(1, 3, 224, 224) + model = UNetMobV2(num_classes=2) + output = model(fake_image) + print(output.size()) \ No newline at end of file diff --git a/hugging_face/cloud_adapter/utils.py b/hugging_face/cloud_adapter/utils.py new file mode 100644 index 0000000..0367331 --- /dev/null +++ b/hugging_face/cloud_adapter/utils.py @@ -0,0 +1,58 @@ +import torch.nn as nn +from typing import List +from mmengine.logging import MMLogger + +first_set_requires_grad = True +first_set_train = True + + +def set_requires_grad(model: nn.Module, keywords: List[str]): + """ + notice:key in name! 
+ """ + requires_grad_names = [] + num_params = 0 + num_trainable = 0 + for name, param in model.named_parameters(): + num_params += param.numel() + if any(key in name for key in keywords): + param.requires_grad = True + requires_grad_names.append(name) + num_trainable += param.numel() + else: + param.requires_grad = False + global first_set_requires_grad + if first_set_requires_grad: + logger = MMLogger.get_current_instance() + for name in requires_grad_names: + logger.info(f"set_requires_grad----{name}") + logger.info( + f"Total trainable params--{num_trainable}, All params--{num_params}, Ratio--{num_trainable*100/num_params:.1f}%" + ) + first_set_requires_grad = False + + +def _set_train(model: nn.Module, keywords: List[str], prefix: str = ""): + train_names = [] + for name, child in model.named_children(): + fullname = ".".join([prefix, name]) + if any(name.startswith(key) for key in keywords): + train_names.append(fullname) + child.train() + else: + train_names += _set_train(child, keywords, prefix=fullname) + return train_names + + +def set_train(model: nn.Module, keywords: List[str]): + """ + notice:sub name startwith key! + """ + model.train(False) + train_names = _set_train(model, keywords) + global first_set_train + if first_set_train: + logger = MMLogger.get_current_instance() + for train_name in train_names: + logger.info(f"set_train----{train_name}") + first_set_train = False \ No newline at end of file diff --git a/hugging_face/example_inputs/gf1/11.png b/hugging_face/example_inputs/gf1/11.png new file mode 100644 index 0000000..f140af3 Binary files /dev/null and b/hugging_face/example_inputs/gf1/11.png differ diff --git a/hugging_face/example_inputs/gf1/48.png b/hugging_face/example_inputs/gf1/48.png new file mode 100644 index 0000000..1a2ef6d Binary files /dev/null and b/hugging_face/example_inputs/gf1/48.png differ diff --git a/hugging_face/example_inputs/gf1/9.png b/hugging_face/example_inputs/gf1/9.png new file mode 100644 index 0000000..a206fea Binary files /dev/null and b/hugging_face/example_inputs/gf1/9.png differ diff --git a/hugging_face/example_inputs/gf2/160.png b/hugging_face/example_inputs/gf2/160.png new file mode 100644 index 0000000..3d5df50 Binary files /dev/null and b/hugging_face/example_inputs/gf2/160.png differ diff --git a/hugging_face/example_inputs/gf2/2.png b/hugging_face/example_inputs/gf2/2.png new file mode 100644 index 0000000..b25bcb2 Binary files /dev/null and b/hugging_face/example_inputs/gf2/2.png differ diff --git a/hugging_face/example_inputs/gf2/63.png b/hugging_face/example_inputs/gf2/63.png new file mode 100644 index 0000000..ad831a8 Binary files /dev/null and b/hugging_face/example_inputs/gf2/63.png differ diff --git a/hugging_face/example_inputs/hrc_whu/barren_7.png b/hugging_face/example_inputs/hrc_whu/barren_7.png new file mode 100644 index 0000000..d0fb526 Binary files /dev/null and b/hugging_face/example_inputs/hrc_whu/barren_7.png differ diff --git a/hugging_face/example_inputs/hrc_whu/snow_10.png b/hugging_face/example_inputs/hrc_whu/snow_10.png new file mode 100644 index 0000000..af123db Binary files /dev/null and b/hugging_face/example_inputs/hrc_whu/snow_10.png differ diff --git a/hugging_face/example_inputs/hrc_whu/vegetation_21.png b/hugging_face/example_inputs/hrc_whu/vegetation_21.png new file mode 100644 index 0000000..6ea92b4 Binary files /dev/null and b/hugging_face/example_inputs/hrc_whu/vegetation_21.png differ diff --git a/hugging_face/example_inputs/l1c/1.png b/hugging_face/example_inputs/l1c/1.png new file mode 
100644 index 0000000..f36ecf6 Binary files /dev/null and b/hugging_face/example_inputs/l1c/1.png differ diff --git a/hugging_face/example_inputs/l1c/27.png b/hugging_face/example_inputs/l1c/27.png new file mode 100644 index 0000000..96a3bc2 Binary files /dev/null and b/hugging_face/example_inputs/l1c/27.png differ diff --git a/hugging_face/example_inputs/l1c/76.png b/hugging_face/example_inputs/l1c/76.png new file mode 100644 index 0000000..c06811d Binary files /dev/null and b/hugging_face/example_inputs/l1c/76.png differ diff --git a/hugging_face/example_inputs/l2a/121.png b/hugging_face/example_inputs/l2a/121.png new file mode 100644 index 0000000..7d9d4cb Binary files /dev/null and b/hugging_face/example_inputs/l2a/121.png differ diff --git a/hugging_face/example_inputs/l2a/18.png b/hugging_face/example_inputs/l2a/18.png new file mode 100644 index 0000000..903d2f1 Binary files /dev/null and b/hugging_face/example_inputs/l2a/18.png differ diff --git a/hugging_face/example_inputs/l2a/35.png b/hugging_face/example_inputs/l2a/35.png new file mode 100644 index 0000000..9fa0521 Binary files /dev/null and b/hugging_face/example_inputs/l2a/35.png differ diff --git a/hugging_face/example_inputs/l8/barren_LC81390292014135LGN00_patch_5632_512.png b/hugging_face/example_inputs/l8/barren_LC81390292014135LGN00_patch_5632_512.png new file mode 100644 index 0000000..278dafd Binary files /dev/null and b/hugging_face/example_inputs/l8/barren_LC81390292014135LGN00_patch_5632_512.png differ diff --git a/hugging_face/example_inputs/l8/forest_LC80160502014041LGN00_patch_4608_4608.png b/hugging_face/example_inputs/l8/forest_LC80160502014041LGN00_patch_4608_4608.png new file mode 100644 index 0000000..4ba4471 Binary files /dev/null and b/hugging_face/example_inputs/l8/forest_LC80160502014041LGN00_patch_4608_4608.png differ diff --git a/hugging_face/example_inputs/l8/shrubland_LC81020802014100LGN00_patch_1024_3584.png b/hugging_face/example_inputs/l8/shrubland_LC81020802014100LGN00_patch_1024_3584.png new file mode 100644 index 0000000..2d60435 Binary files /dev/null and b/hugging_face/example_inputs/l8/shrubland_LC81020802014100LGN00_patch_1024_3584.png differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d3d1452 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +numpy +ftfy +scipy +prettytable +matplotlib +regex +timm +einops \ No newline at end of file diff --git a/tools/convert_datasets/cityscapes.py b/tools/convert_datasets/cityscapes.py new file mode 100644 index 0000000..1da0238 --- /dev/null +++ b/tools/convert_datasets/cityscapes.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
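+# Converts Cityscapes "*_gtFine_polygons.json" annotations into
+# "*_labelTrainIds.png" masks with cityscapesscripts, optionally in parallel
+# via --nproc, and writes train/val/test split lists into the output dir.
+# Illustrative invocation (the dataset path is an example, not a repo default):
+#   python tools/convert_datasets/cityscapes.py data/cityscapes --gt-dir gtFine --nproc 8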
+import argparse +import os.path as osp + +import mmengine +from cityscapesscripts.preparation.json2labelImg import json2labelImg + + +def convert_json_to_label(json_file): + label_file = json_file.replace("_polygons.json", "_labelTrainIds.png") + json2labelImg(json_file, label_file, "trainIds") + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert Cityscapes annotations to TrainIds" + ) + parser.add_argument("cityscapes_path", help="cityscapes data path") + parser.add_argument("--gt-dir", default="gtFine", type=str) + parser.add_argument("-o", "--out-dir", help="output path") + parser.add_argument("--nproc", default=8, type=int, help="number of process") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cityscapes_path = args.cityscapes_path + out_dir = args.out_dir if args.out_dir else cityscapes_path + mmengine.mkdir_or_exist(out_dir) + + gt_dir = osp.join(cityscapes_path, args.gt_dir) + + poly_files = [] + for poly in mmengine.scandir(gt_dir, "_polygons.json", recursive=True): + poly_file = osp.join(gt_dir, poly) + poly_files.append(poly_file) + if args.nproc > 1: + mmengine.track_parallel_progress(convert_json_to_label, poly_files, args.nproc) + else: + mmengine.track_progress(convert_json_to_label, poly_files) + + split_names = ["train", "val", "test"] + + for split in split_names: + filenames = [] + for poly in mmengine.scandir( + osp.join(gt_dir, split), "_polygons.json", recursive=True + ): + filenames.append(poly.replace("_gtFine_polygons.json", "")) + with open(osp.join(out_dir, f"{split}.txt"), "w") as f: + f.writelines(f + "\n" for f in filenames) + + +if __name__ == "__main__": + main() diff --git a/tools/convert_datasets/create_gf12.py b/tools/convert_datasets/create_gf12.py new file mode 100644 index 0000000..45ae511 --- /dev/null +++ b/tools/convert_datasets/create_gf12.py @@ -0,0 +1,80 @@ +import os +from PIL import Image +import numpy as np +from dataset.gf12ms_whu import GF12MSWHU +import albumentations +from tqdm import tqdm +import sys +import argparse + + +def get_args(): + parse = argparse.ArgumentParser() + parse.add_argument( + "--root", + type=str, + help="gf12数据集路径", + default="/data/zouxuechao/cloudseg/gf12ms_whu", + ) + parse.add_argument( + "--save_path", type=str, help="数据集保存路径", default="/data/zouxuechao/mmseg" + ) + args = parse.parse_args() + return args.root, args.save_path + + +def imgRGB2P(ann, dst_path): + # 定义调色板的索引 + bin_colormap_reverse = np.array([[0, 0, 0], [255, 255, 255]]).astype( + np.uint8 + ) # 会按照值的循序进行索引 + # 转化为p模式 + img_p = ann.convert("P") + img_p.putpalette(bin_colormap_reverse) + img_p.save(dst_path) + + +def get_dataset(root,save_path,phase, serial): + all_transform = albumentations.PadIfNeeded( + min_height=256, min_width=256, p=1, always_apply=True + ) + dataset = GF12MSWHU( + root=root, phase=phase, serial=serial, all_transform=all_transform + ) + child_root = "gf12ms_whu_gf1" if serial == "gf1" else "gf12ms_whu_gf2" + os.makedirs(os.path.join(save_path,child_root, "img_dir", phase), exist_ok=True) + os.makedirs(os.path.join(save_path,child_root, "ann_dir", phase), exist_ok=True) + + for data in tqdm( + dataset, total=len(dataset), desc=f"{serial}-{phase} processing..." 
+ ): + img_path = data["img_path"] + filename = img_path.split(os.path.sep)[-1] + + filename = filename.replace("tiff", "png") + + ann = data["ann"].astype(np.uint8) + + img = data["img"] + + img = (img * 255).astype(np.uint8) + + img = Image.fromarray(img) + ann = Image.fromarray(ann) + + img.save(os.path.join(save_path,child_root, "img_dir", phase, filename)) + imgRGB2P(ann, os.path.join(save_path,child_root, "ann_dir", phase, filename)) + + +if __name__ == "__main__": + # use example:PYTHONPATH=$PYTHONPATH:. python tools/convert_datasets/create_gf12.py --root /data/zouxuechao/cloudseg/gf12ms_whu --save_path /data/zouxuechao/mmseg + root,save_path = get_args() + for phase in ["train","val"]: + for serial in ["gf1","gf2"]: + get_dataset(root,save_path,phase,serial) + + # get_dataset("train",serial="gf1") + # get_dataset("val",serial="gf1") + + # get_dataset("train",serial="gf2") + # get_dataset("val",serial="gf2") diff --git a/tools/convert_datasets/create_l1c_l2a.py b/tools/convert_datasets/create_l1c_l2a.py new file mode 100644 index 0000000..f9743ff --- /dev/null +++ b/tools/convert_datasets/create_l1c_l2a.py @@ -0,0 +1,59 @@ +import os +from PIL import Image +import numpy as np +from dataset.cloudsen12_high import CloudSEN12High +from tqdm import tqdm +import sys +import argparse + + +def get_args(): + parse = argparse.ArgumentParser() + parse.add_argument( + "--root", + type=str, + help="cloudseghigh数据集路径", + default="/data/zouxuechao/cloudseg/cloudsen12_high", + ) + parse.add_argument( + "--save_path", type=str, help="数据集保存路径", default="/data/zouxuechao/mmseg" + ) + args = parse.parse_args() + return args.root, args.save_path + + + +def get_dataset(root,save_path,phase,level): + dataset = CloudSEN12High( + root=root, phase=phase,level=level + ) + child_root = "cloudsen12_high_l1c" if level == "l1c" else "cloudsen12_high_l2a" + os.makedirs(os.path.join(save_path,child_root, "img_dir", phase), exist_ok=True) + os.makedirs(os.path.join(save_path,child_root, "ann_dir", phase), exist_ok=True) + index = 0 + for data in tqdm( + dataset, total=len(dataset), desc=f"cloudsen12_high-{level}-{phase} processing..." + ): + + filename = f"{index}.png" + + ann = data["ann"].astype(np.uint8) + + img = data["img"] + + img = (img * 255).astype(np.uint8) + + img = Image.fromarray(img) + ann = Image.fromarray(ann) + + img.save(os.path.join(save_path,child_root, "img_dir", phase, filename)) + ann.save(os.path.join(save_path,child_root, "ann_dir", phase, filename)) + + index += 1 + +if __name__ == "__main__": + # use example: PYTHONPATH=$PYTHONPATH:. 
python tools/convert_datasets/create_l1c_l2a.py --root /data/zouxuechao/cloudseg/cloudsen12_high --save_path /data/zouxuechao/mmseg + root,save_path = get_args() + for phase in ["train","val","test"]: + for level in ["l1c","l2a"]: + get_dataset(root,save_path,phase,level) diff --git a/tools/convert_datasets/create_l8.py b/tools/convert_datasets/create_l8.py new file mode 100644 index 0000000..a160c1b --- /dev/null +++ b/tools/convert_datasets/create_l8.py @@ -0,0 +1,60 @@ +import os +from PIL import Image +import numpy as np +from dataset.l8_biome_crop import L8BiomeCrop +from tqdm import tqdm +import sys +import argparse + + +def get_args(): + parse = argparse.ArgumentParser() + parse.add_argument( + "--root", + type=str, + help="l8_biome数据集路径", + default="/home/zouxuechao/zs/rein/data/l8_biome_crop", + ) + parse.add_argument( + "--save_path", type=str, help="数据集保存路径", default="/data/zouxuechao/mmseg" + ) + args = parse.parse_args() + return args.root, args.save_path + + + +def get_dataset(root,save_path,phase): + dataset = L8BiomeCrop( + root=root, phase=phase + ) + child_root = "l8_biome" + os.makedirs(os.path.join(save_path,child_root, "img_dir", phase), exist_ok=True) + os.makedirs(os.path.join(save_path,child_root, "ann_dir", phase), exist_ok=True) + + for data in tqdm( + dataset, total=len(dataset), desc=f"l8_biome-{phase} processing..." + ): + img_path = data["img_path"] + filename = img_path.split(os.path.sep)[-1] + + filename = filename.split(".")[0] + + filename = filename + ".png" + + ann = data["ann"].astype(np.uint8) + + img = data["img"] + + img = (img * 255).astype(np.uint8) + + img = Image.fromarray(img) + ann = Image.fromarray(ann) + + img.save(os.path.join(save_path,child_root, "img_dir", phase, filename)) + ann.save(os.path.join(save_path,child_root, "ann_dir", phase, filename)) + +if __name__ == "__main__": + # use example: PYTHONPATH=$PYTHONPATH:. 
python tools/convert_datasets/create_l8.py --root /home/zouxuechao/zs/rein/data/l8_biome_crop --save_path /data/zouxuechao/mmseg + root,save_path = get_args() + for phase in ["train","val","test"]: + get_dataset(root,save_path,phase) diff --git a/tools/convert_datasets/gta.py b/tools/convert_datasets/gta.py new file mode 100644 index 0000000..792f20a --- /dev/null +++ b/tools/convert_datasets/gta.py @@ -0,0 +1,114 @@ +# Obtained from: https://github.com/lhoyer/DAFormer +# Aiming to convert the annotation format to be TrainId + +import argparse +import json +import os.path as osp + +import mmengine +import numpy as np +from PIL import Image + + +def convert_to_train_id(file): + # re-assign labels to match the format of Cityscapes dataset + pil_label = Image.open(file) + label = np.asarray(pil_label) + id_to_trainid = { + 7: 0, + 8: 1, + 11: 2, + 12: 3, + 13: 4, + 17: 5, + 19: 6, + 20: 7, + 21: 8, + 22: 9, + 23: 10, + 24: 11, + 25: 12, + 26: 13, + 27: 14, + 28: 15, + 31: 16, + 32: 17, + 33: 18 + } + label_copy = 255 * np.ones(label.shape, dtype=np.uint8) + sample_class_stats = {} + for k, v in id_to_trainid.items(): + k_mask = label == k + label_copy[k_mask] = v + n = int(np.sum(k_mask)) + if n > 0: + sample_class_stats[v] = n + new_file = file.replace('.png', '_labelTrainIds.png') + assert file != new_file + sample_class_stats['file'] = new_file + Image.fromarray(label_copy, mode='L').save(new_file) + return sample_class_stats + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert GTA annotations to TrainIds') + parser.add_argument('gta_path', help='gta data path') + parser.add_argument('--gt-dir', default='labels', type=str) + parser.add_argument('-o', '--out-dir', help='output path') + parser.add_argument( + '--nproc', default=4, type=int, help='number of process') + args = parser.parse_args() + return args + + +def save_class_stats(out_dir, sample_class_stats): + with open(osp.join(out_dir, 'sample_class_stats.json'), 'w') as of: + json.dump(sample_class_stats, of, indent=2) + + sample_class_stats_dict = {} + for stats in sample_class_stats: + f = stats.pop('file') + sample_class_stats_dict[f] = stats + with open(osp.join(out_dir, 'sample_class_stats_dict.json'), 'w') as of: + json.dump(sample_class_stats_dict, of, indent=2) + + samples_with_class = {} + for file, stats in sample_class_stats_dict.items(): + for c, n in stats.items(): + if c not in samples_with_class: + samples_with_class[c] = [(file, n)] + else: + samples_with_class[c].append((file, n)) + with open(osp.join(out_dir, 'samples_with_class.json'), 'w') as of: + json.dump(samples_with_class, of, indent=2) + + +def main(): + args = parse_args() + gta_path = args.gta_path + out_dir = args.out_dir if args.out_dir else gta_path + mmengine.mkdir_or_exist(out_dir) + + gt_dir = osp.join(gta_path, args.gt_dir) + + poly_files = [] + for poly in mmengine.scandir( + gt_dir, suffix=tuple(f'{i}.png' for i in range(10)), + recursive=True): + poly_file = osp.join(gt_dir, poly) + poly_files.append(poly_file) + poly_files = sorted(poly_files) + + if args.nproc > 1: + sample_class_stats = mmengine.track_parallel_progress( + convert_to_train_id, poly_files, args.nproc) + else: + sample_class_stats = mmengine.track_progress(convert_to_train_id, + poly_files) + + save_class_stats(out_dir, sample_class_stats) + + +if __name__ == '__main__': + main() diff --git a/tools/convert_datasets/mapillary2cityscape.py b/tools/convert_datasets/mapillary2cityscape.py new file mode 100644 index 0000000..0da6dc0 --- /dev/null +++ 
b/tools/convert_datasets/mapillary2cityscape.py @@ -0,0 +1,217 @@ +# Obtained from: https://github.com/openseg-group/openseg.pytorch.git +# Aiming to convert the annotation format to be TrainId and match the classes in Cityscapes dataset + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import json +import os + +import PIL.Image as Image +import cv2 +import numpy as np + +LABEL_DIR = "label" + + +def convert_to_train_id(trans_idx, train_mask_folder, train_label_dir, filename): + if filename.endswith(".png"): + maskpath = os.path.join(train_mask_folder, filename) + if os.path.isfile(maskpath): + mask = np.asarray(Image.open(maskpath)) + mask = trans_idx[mask] + cv2.imwrite( + os.path.join(train_label_dir, filename), + mask.astype(np.uint8), + ) + else: + print("cannot find the mask:", maskpath) + + +class MapillaryGenerator(object): + def __init__(self, args, label_dir=LABEL_DIR): + self.args = args + self.version = args.version + self.train_label_dir = os.path.join(self.args.save_dir, "train", label_dir) + self.val_label_dir = os.path.join(self.args.save_dir, "val", label_dir) + if not os.path.exists(self.train_label_dir): + os.makedirs(self.train_label_dir) + + if not os.path.exists(self.val_label_dir): + os.makedirs(self.val_label_dir) + + def generate_label(self): + trans_idx = self.get_trans_idx() + + # train_img_folder = os.path.join(self.args.ori_root_dir, 'images/training') + train_mask_folder = os.path.join( + self.args.ori_root_dir, f"training/{self.version}/labels" + ) + + # val_img_folder = os.path.join(self.args.ori_root_dir, 'images/validation') + val_mask_folder = os.path.join( + self.args.ori_root_dir, f"validation/{self.version}/labels" + ) + for filename in os.listdir(train_mask_folder): + print(filename) + convert_to_train_id( + trans_idx, train_mask_folder, self.train_label_dir, filename + ) + for filename in os.listdir(val_mask_folder): + print(filename) + convert_to_train_id( + trans_idx, val_mask_folder, self.val_label_dir, filename + ) + + def get_trans_idx(self): + # class name and index of cityscapes dataset + # [7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33], + if not self.args.train_id: + class_name_dict = { + 7: "road", + 8: "sidewalk", + 11: "building", + 12: "wall", + 13: "fence", + 17: "pole", + 19: "trafficlight", + 20: "trafficsign", + 21: "vegetation", + 22: "terrain", + 23: "sky", + 24: "person", + 25: "rider", + 26: "car", + 27: "truck", + 28: "bus", + 31: "train", + 32: "motorcycle", + 33: "bicycle", + } + else: + class_name_dict = { + 0: "road", + 1: "sidewalk", + 2: "building", + 3: "wall", + 4: "fence", + 5: "pole", + 6: "trafficlight", + 7: "trafficsign", + 8: "vegetation", + 9: "terrain", + 10: "sky", + 11: "person", + 12: "rider", + 13: "car", + 14: "truck", + 15: "bus", + 16: "train", + 17: "motorcycle", + 18: "bicycle", + } + class_name_dict = {v: k for k, v in class_name_dict.items()} + + # class name and index of mapillary dataset + with open( + os.path.join(self.args.ori_root_dir, f"config_{self.version}.json") + ) as config_file: + labels = json.load(config_file)["labels"] + + print("Following classes are mapped to corresponding classes in cityscapes:") + mapillary2city = [255] * len(labels) + ignored = [] + + for label_id, label in enumerate(labels): + name = label["readable"].lower().replace(" ", "").replace("-", "") + if name in class_name_dict.keys(): + mapillary2city[label_id] = class_name_dict[name] + print( + "{} => {}: {} => 
{}".format( + name, name, label_id, class_name_dict[name] + ) + ) + elif "trafficsign" in name or "front" in name or "back" in name: + mapillary2city[label_id] = class_name_dict["trafficsign"] + print( + "{} => {}: {} => {}".format( + name, "traffic sign", label_id, class_name_dict["trafficsign"] + ) + ) + elif "onrail" in name: + mapillary2city[label_id] = class_name_dict["train"] + print( + "{} => {}: {} => {}".format( + name, "train", label_id, class_name_dict["train"] + ) + ) + elif "cyclist" in name or "rider" in name: + mapillary2city[label_id] = class_name_dict["rider"] + print( + "{} => {}: {} => {}".format( + name, "rider", label_id, class_name_dict["rider"] + ) + ) + elif "pole" in name or "streetlight" in name: + mapillary2city[label_id] = class_name_dict["pole"] + print( + "{} => {}: {} => {}".format( + name, "pole", label_id, class_name_dict["pole"] + ) + ) + elif "curb" in name or "pedestrianarea" in name: + mapillary2city[label_id] = class_name_dict["sidewalk"] + print( + "{} => {}: {} => {}".format( + name, "sidewalk", label_id, class_name_dict["sidewalk"] + ) + ) + elif ( + "crosswalkplain" in name + or "parking" in name + or "bikelane" in name + or "servicelane" in name + or "lanemarking" in name + ): + mapillary2city[label_id] = class_name_dict["road"] + print( + "{} => {}: {} => {}".format( + name, "road", label_id, class_name_dict["road"] + ) + ) + else: + ignored.append(name) + + print("\nFollowing classes are mapped to void class:") + print(ignored) + return np.asarray(mapillary2city, dtype=np.uint8) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "ori_root_dir", + type=str, + help="The directory of the cityscapes data.", + ) + parser.add_argument( + "save_dir", + type=str, + help="The directory to save the data.", + ) + parser.add_argument( + "--version", + default="v1.2", + type=str, + ) + parser.add_argument( + "--train_id", + action="store_true", + ) + + args = parser.parse_args() + + Mapillary_generator = MapillaryGenerator(args) + Mapillary_generator.generate_label() diff --git a/tools/convert_datasets/mapillary_resize.py b/tools/convert_datasets/mapillary_resize.py new file mode 100644 index 0000000..99ab6ec --- /dev/null +++ b/tools/convert_datasets/mapillary_resize.py @@ -0,0 +1,67 @@ +# Aiming to resize the validation set for efficient online evaluation + +import argparse +import os.path as osp + +import mmengine +from PIL import Image + + +def resize_half(args): + ( + img_id, + image_folder, + label_folder, + dst_image_folder, + dst_label_folder, + ) = args + im_file = osp.join(image_folder, f"{img_id}.jpg") + label_file = osp.join(label_folder, f"{img_id}.png") + dst_im_file = osp.join(dst_image_folder, f"{img_id}.jpg") + dst_label_file = osp.join(dst_label_folder, f"{img_id}.png") + im = Image.open(im_file) + h, w = im.size + im = im.resize((h // 2, w // 2), resample=Image.BICUBIC) + im.save(dst_im_file) + label = Image.open(label_file) + label = label.resize((h // 2, w // 2), resample=Image.NEAREST) + label.save(dst_label_file) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("image_folder") + parser.add_argument("label_folder") + parser.add_argument("dst_image_folder") + parser.add_argument("dst_label_folder") + parser.add_argument("--nproc", default=8, type=int, help="number of process") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + mmengine.mkdir_or_exist(args.dst_image_folder) + mmengine.mkdir_or_exist(args.dst_label_folder) + imgs = 
[] + for filename in mmcv.scandir(args.image_folder, suffix=".jpg"): + id = osp.splitext(filename)[0] + imgs.append(id) + tasks = [ + ( + id, + args.image_folder, + args.label_folder, + args.dst_image_folder, + args.dst_label_folder, + ) + for id in imgs + ] + if args.nproc > 1: + mmengine.track_parallel_progress(resize_half, tasks, args.nproc) + else: + mmengine.track_progress(resize_half, tasks) + + +if __name__ == "__main__": + main() diff --git a/tools/convert_datasets/urbansyn.py b/tools/convert_datasets/urbansyn.py new file mode 100644 index 0000000..dd2b4ad --- /dev/null +++ b/tools/convert_datasets/urbansyn.py @@ -0,0 +1,29 @@ +from PIL import Image +import os +import argparse + +def replace_pixel_value(directory, target_value, new_value): + for filename in os.listdir(directory): + if filename.endswith(".png"): + file_path = os.path.join(directory, filename) + with Image.open(file_path) as img: + img = img.convert("L") # Ensure the image is in grayscale mode + data = img.getdata() + new_data = [] + for item in data: + if item == target_value: + new_data.append(new_value) + else: + new_data.append(item) + img.putdata(new_data) + img.save(file_path) + print(f"Processed: {file_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Replace pixel values in images within a directory.") + parser.add_argument("directory", help="Directory containing images") + parser.add_argument("--target_value", type=int, default=19, help="Pixel value to be replaced (default: 19)") + parser.add_argument("--new_value", type=int, default=255, help="New pixel value (default: 255)") + + args = parser.parse_args() + replace_pixel_value(args.directory, args.target_value, args.new_value) diff --git a/tools/convert_models/convert_dinov2.py b/tools/convert_models/convert_dinov2.py new file mode 100644 index 0000000..806275b --- /dev/null +++ b/tools/convert_models/convert_dinov2.py @@ -0,0 +1,88 @@ +import torch +import os.path as osp +from collections import OrderedDict +from torch import Tensor +import torch.nn.functional as F +import sys +import numpy as np +import argparse + + +def parse_args(): + args = argparse.ArgumentParser() + args.add_argument("pretrained", type=str) + args.add_argument("converted", type=str) + args.add_argument("--kernel", default=16, type=int) + args.add_argument("--height", default=512, type=int) + args.add_argument("--width", default=512, type=int) + return args.parse_args() + + +def load_weight(pretrained_path): + if not osp.isfile(pretrained_path): + raise FileNotFoundError( + f"{pretrained_path} dont exist(absolute path: {osp.abspath(pretrained_path)})" + ) + weight = torch.load(pretrained_path, map_location="cpu") + if len(weight.keys()) <= 10: + print(f"The read weights may be abnormal, as shown below:") + print(weight.keys()) + raise KeyError() + return weight + + +def interpolate_patch_embed_(weight, key="patch_embed.proj.weight", kernel_conv=16): + assert key in weight, f"{key} must in {weight.keys()}" + ori_shape = weight[key].shape + weight[key] = F.interpolate( + weight[key].float(), + size=(kernel_conv, kernel_conv), + mode="bicubic", + align_corners=False, + ) + dst_shape = weight[key].shape + print(f"Convert conv kernel in patch embed layer: {ori_shape} -> {dst_shape}") + + +def interpolate_pos_embed_( + weight: dict, key="pos_embed", crop_size=(512, 512), kernel_conv=16 +): + pos_cls, pos_tokens = weight[key][:, :1, :], weight["pos_embed"][:, 1:, :] + embed_dim = pos_tokens.shape[-1] + orig_size = int(pos_tokens.shape[-2] ** 0.5) + 
orig_shape = (-1, orig_size, orig_size, embed_dim) + crop_size = tuple(L // kernel_conv for L in crop_size) + resized_pos_tokens = F.interpolate( + pos_tokens.reshape(*orig_shape).permute(0, 3, 1, 2), + size=crop_size, + mode="bicubic", + align_corners=False, + ) + dst_shape = resized_pos_tokens.shape + resized_pos_tokens = resized_pos_tokens.permute(0, 2, 3, 1).reshape( + -1, np.prod(crop_size), embed_dim + ) + weight[key] = torch.cat((pos_cls, resized_pos_tokens), dim=1) + print( + f"Convert pos embedding: {pos_tokens.shape} -> {orig_shape} -> {dst_shape} -> {resized_pos_tokens.shape}" + ) + + +def main(): + args = parse_args() + pretrained_path = args.pretrained + converted_path = args.converted + kernel_conv = args.kernel + crop_size = (args.height, args.width) + weight = load_weight(pretrained_path) + print("Load from", pretrained_path) + interpolate_patch_embed_(weight, kernel_conv=kernel_conv) + interpolate_pos_embed_(weight, crop_size=crop_size, kernel_conv=kernel_conv) + torch.save(weight, converted_path) + print("Save to", converted_path) + return args + + +# Check if the script is run directly (and not imported) +if __name__ == "__main__": + main() diff --git a/tools/convert_models/convert_eva2_512x512.py b/tools/convert_models/convert_eva2_512x512.py new file mode 100644 index 0000000..9b2c948 --- /dev/null +++ b/tools/convert_models/convert_eva2_512x512.py @@ -0,0 +1,108 @@ +# sourced from EVA02 +import argparse +import torch + + +def interpolate_pos_embed(checkpoint_model, new_size=16): + if "pos_embed" in checkpoint_model: + pos_embed_checkpoint = checkpoint_model["pos_embed"] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = 1024 + num_extra_tokens = 1 + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches**0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print( + "Position interpolate from %dx%d to %dx%d" + % (orig_size, orig_size, new_size, new_size) + ) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape( + -1, orig_size, orig_size, embedding_size + ).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens.float(), + size=(new_size, new_size), + mode="bicubic", + align_corners=False, + ) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model["pos_embed"] = new_pos_embed + if "positional_embedding" in checkpoint_model: + positional_embedding_checkpoint = checkpoint_model["positional_embedding"] + embedding_size = positional_embedding_checkpoint.shape[-1] + num_patches = 1024 + num_extra_tokens = 1 + # height (== width) for the checkpoint position embedding + orig_size = int( + (positional_embedding_checkpoint.shape[-2] - num_extra_tokens) ** 0.5 + ) + # height (== width) for the new position embedding + new_size = int(num_patches**0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print( + "Position interpolate from %dx%d to %dx%d" + % (orig_size, orig_size, new_size, new_size) + ) + extra_tokens = positional_embedding_checkpoint[:num_extra_tokens, :] + # only the position tokens are interpolated + pos_tokens = positional_embedding_checkpoint[num_extra_tokens:, :] + 
pos_tokens = pos_tokens.reshape( + -1, orig_size, orig_size, embedding_size + ).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens.float(), + size=(new_size, new_size), + mode="bicubic", + align_corners=False, + ) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2).squeeze(0) + new_positional_embedding = torch.cat((extra_tokens, pos_tokens), dim=0) + checkpoint_model["positional_embedding"] = new_positional_embedding + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="interpolate patch_embed kernel") + parser.add_argument( + "input", + default="/path/to/eva_psz14.pt", + type=str, + metavar="PATH", + help="path to input EVA checkpoint with patch_embed kernel_size=14x14", + ) + parser.add_argument( + "output", + default="/path/to/eva_psz14to16.pt", + type=str, + metavar="PATH", + help="path to output EVA checkpoint with patch_embed kernel_size=16x16", + ) + args = parser.parse_args() + + checkpoint = torch.load(args.input, map_location=torch.device("cpu")) + + # interpolate patch_embed + if "model" in checkpoint: + checkpoint = checkpoint["model"] + patch_embed = checkpoint["patch_embed.proj.weight"] + C_o, C_in, H, W = patch_embed.shape + patch_embed = torch.nn.functional.interpolate( + patch_embed.float(), size=(16, 16), mode="bicubic", align_corners=False + ) + checkpoint["patch_embed.proj.weight"] = patch_embed + + # interpolate pos_embed too + interpolate_pos_embed(checkpoint, new_size=32) + + print("======== new state_dict ========") + for k, v in list(checkpoint.items()): + print(k, " ", v.shape) + + torch.save(checkpoint, args.output) diff --git a/tools/convert_models/convert_sam.py b/tools/convert_models/convert_sam.py new file mode 100644 index 0000000..993f88a --- /dev/null +++ b/tools/convert_models/convert_sam.py @@ -0,0 +1,91 @@ +import torch +import os.path as osp +from collections import OrderedDict +from torch import Tensor +import torch.nn.functional as F +import sys +import numpy as np +import argparse + + +def parse_args(): + args = argparse.ArgumentParser() + args.add_argument("pretrained", type=str) + args.add_argument("converted", type=str) + args.add_argument("--kernel", default=16, type=int) + args.add_argument("--height", default=512, type=int) + args.add_argument("--width", default=512, type=int) + return args.parse_args() + + +def select_component(d: dict, k: str): + return {_k.replace(k, ""): v for _k, v in d.items() if k in _k} + + +def load_weight(pretrained_path): + if not osp.isfile(pretrained_path): + raise FileNotFoundError( + f"{pretrained_path} dont exist(absolute path: {osp.abspath(pretrained_path)})" + ) + weight = torch.load(pretrained_path, map_location="cpu") + weight = select_component(weight, "image_encoder.") + if len(weight.keys()) <= 10: + print(f"The read weights may be abnormal, as shown below:") + print(weight.keys()) + raise KeyError() + return weight + + +def interpolate_patch_embed_(weight, key="patch_embed.proj.weight", kernel_conv=16): + assert key in weight, f"{key} must in {weight.keys()}" + ori_shape = weight[key].shape + weight[key] = F.interpolate( + weight[key].float(), + size=(kernel_conv, kernel_conv), + mode="bicubic", + align_corners=False, + ) + dst_shape = weight[key].shape + print(f"Convert conv kernel in patch embed layer: {ori_shape} -> {dst_shape}") + + +def interpolate_pos_embed_( + weight: dict, key="pos_embed", crop_size=(512, 512), kernel_conv=16 +): + pos_tokens = weight[key] + orig_shape = pos_tokens.shape + dst_shape = (orig_shape[0],orig_shape[-1]) 
+ crop_size + embed_dim = orig_shape[-1] + # ... + crop_size = tuple(L // kernel_conv for L in crop_size) + resized_pos_tokens = F.interpolate( + pos_tokens.permute(0, 3, 1, 2), + size=crop_size, + mode="bicubic", + align_corners=False, + ) + resized_pos_tokens = resized_pos_tokens.permute(0, 2, 3, 1) + weight[key] = resized_pos_tokens + print( + f"Convert pos embedding: {pos_tokens.shape} -> {orig_shape} -> {dst_shape} -> {resized_pos_tokens.shape}" + ) + + +def main(): + args = parse_args() + pretrained_path = args.pretrained + converted_path = args.converted + kernel_conv = args.kernel + crop_size = (args.height, args.width) + weight = load_weight(pretrained_path) + print("Load from", pretrained_path) + interpolate_patch_embed_(weight, kernel_conv=kernel_conv) + interpolate_pos_embed_(weight, crop_size=crop_size, kernel_conv=kernel_conv) + torch.save(weight, converted_path) + print("Save to", converted_path) + return args + + +# Check if the script is run directly (and not imported) +if __name__ == "__main__": + main() diff --git a/tools/dist_train.sh b/tools/dist_train.sh new file mode 100644 index 0000000..a857df7 --- /dev/null +++ b/tools/dist_train.sh @@ -0,0 +1,17 @@ +CONFIG=$1 +GPUS=$2 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/train.py \ + $CONFIG \ + --launcher pytorch ${@:3} diff --git a/tools/eval_l8_scene.py b/tools/eval_l8_scene.py new file mode 100644 index 0000000..88c75ca --- /dev/null +++ b/tools/eval_l8_scene.py @@ -0,0 +1,106 @@ +import argparse +from glob import glob +import os +import numpy as np +from PIL import Image +from mmeval import MeanIoU +import torch +from mmseg.apis import init_model, inference_model +from rich.progress import track + + +def parse_args(): + default_config = "work_dirs/ours_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w/ours_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w.py" + default_weight = "work_dirs/ours_adapter_pmaa_convnext_lora_16_adapter_all_l8_load_head_40w/full_weight.pth" + parser = argparse.ArgumentParser( + description="MMSeg test (and eval) a model") + parser.add_argument( + "--config", help="Path to the training configuration file.", default=default_config) + parser.add_argument( + "--checkpoint", help="Path to the checkpoint file for both the REIN and head models.", default=default_weight) + parser.add_argument( + "--img_dir", help="Path to the directory containing images to be processed.", default="data/l8_biome") + parser.add_argument("--device", default="cuda:1") + + args = parser.parse_args() + return args.config, args.checkpoint, args.img_dir, args.device + + +def get_img_list(img_dir: str): + image_list = glob(os.path.join(img_dir, "img_dir", "test", "*")) + assert len(image_list) > 0, f"{img_dir} is empty" + return image_list + + +def main(): + import cloud_adapter + import cloud_adapter.models + + config, checkpoint, img_dir, device = parse_args() + model = init_model(config, checkpoint, device) + image_list = get_img_list(img_dir) + + scenes_cls = [ + "Grass/Crops", + "Urban", + "Wetlands", + "Snow/Ice", + "Barren", + "Forest", + "Shrubland", + "Water", + ] + scene_mapping = { + "grass":"Grass/Crops", + "urban":"Urban", + "wetlands":"Wetlands", + "forest":"Forest", + "shrubland":"Shrubland", + 
"snow":"Snow/Ice", + "barren":"Barren", + "water":"Water" + } + scene_metrics = {scene: {} for scene in scenes_cls} + miou = MeanIoU(num_classes=4) + for image_path in track(image_list, total=len(image_list)): + ann_path = image_path.replace("img_dir", "ann_dir") + gt = np.array(Image.open(ann_path)) + gt = gt[np.newaxis] + result = inference_model(model, image_path) + pred_sem_seg: np.ndarray = result.pred_sem_seg.data.cpu().numpy() + result_iou = miou(pred_sem_seg, gt) + scene = os.path.basename(image_path).split("_")[0] + scene = scene_mapping[scene] + if "mIoU" not in scene_metrics[scene]: + scene_metrics[scene]["mIoU"] = [] + scene_metrics[scene]["mIoU"].append(result_iou['mIoU']) + + if "aAcc" not in scene_metrics[scene]: + scene_metrics[scene]["aAcc"] = [] + scene_metrics[scene]["aAcc"].append(result_iou['aAcc']) + + if "mAcc" not in scene_metrics[scene]: + scene_metrics[scene]["mAcc"] = [] + scene_metrics[scene]["mAcc"].append(result_iou['mAcc']) + + if "mDice" not in scene_metrics[scene]: + scene_metrics[scene]["mDice"] = [] + scene_metrics[scene]["mDice"].append(result_iou['mDice']) + + # 计算平均指标 + for scene in scenes_cls: + scene_metrics[scene]["mIoU"] = sum(scene_metrics[scene]["mIoU"]) / len(scene_metrics[scene]["mIoU"]) + scene_metrics[scene]["aAcc"] = sum(scene_metrics[scene]["aAcc"]) / len(scene_metrics[scene]["aAcc"]) + scene_metrics[scene]["mAcc"] = sum(scene_metrics[scene]["mAcc"]) / len(scene_metrics[scene]["mAcc"]) + scene_metrics[scene]["mDice"] = sum(scene_metrics[scene]["mDice"]) / len(scene_metrics[scene]["mDice"]) + + # 将结果保存为json文件 + import json + with open("results.json", "w") as f: + json.dump(scene_metrics, f,indent=4) + + + + +if __name__ == "__main__": + main() diff --git a/tools/generate_full_weights.py b/tools/generate_full_weights.py new file mode 100644 index 0000000..7de9eda --- /dev/null +++ b/tools/generate_full_weights.py @@ -0,0 +1,44 @@ +import torch +import argparse + + +def main(args): + segmentor_save_path = args.segmentor_save_path + backbone_path = args.backbone + head_path = args.head + + # Load weights from the provided paths + head_weights = torch.load(head_path, map_location="cpu") + backbone_weights = torch.load(backbone_path, map_location="cpu") + + # Prefix backbone weights with 'backbone.' + backbone_weights = {f"backbone.{k}": v for k, v in backbone_weights.items()} + + # Update the REIN head weights with the backbone weights + if "state_dict" in head_weights: + head_weights["state_dict"].update(backbone_weights) + else: + head_weights.update(backbone_weights) + + # Save the combined weights to the specified path + torch.save(head_weights, segmentor_save_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Combine pre-trained backbone weights and fine-tuned head weights into a complete set of segmentor weights." 
+ ) + parser.add_argument( + "--segmentor_save_path", + required=True, + help="Path to save the combined segmentor checkpoint", + ) + parser.add_argument( + "--backbone", required=True, help="Path to the pre-trained backbone weights" + ) + parser.add_argument( + "--head", required=True, help="Path to the fine-tuned head weights" + ) + + args = parser.parse_args() + main(args) diff --git a/tools/inference_video.py b/tools/inference_video.py new file mode 100644 index 0000000..faf097d --- /dev/null +++ b/tools/inference_video.py @@ -0,0 +1,141 @@ +from argparse import ArgumentParser + +import cv2 +from mmengine.model.utils import revert_sync_batchnorm + +from mmseg.apis import inference_model, init_model +from mmseg.utils import get_classes, get_palette +import numpy as np +import torch +import tqdm +import cloud_adapter + +classes = get_classes("cityscapes") +palette = get_palette("cityscapes") + + +def draw_sem_seg(sem_seg: torch.Tensor): + num_classes = len(classes) + sem_seg = sem_seg.data.squeeze(0) + H, W = sem_seg.shape + ids = torch.unique(sem_seg).cpu().numpy() + legal_indices = ids < num_classes + ids = ids[legal_indices] + labels = np.array(ids, dtype=np.int64) + colors = [palette[label] for label in labels] + colors = [torch.tensor(color, dtype=torch.uint8).view(1, 1, 3) for color in colors] + result = torch.zeros([H, W, 3], dtype=torch.uint8) + for label, color in zip(labels, colors): + result[sem_seg == label, :] = color + return result.cpu().numpy() + + +def main(): + parser = ArgumentParser() + parser.add_argument("video", help="Video file or webcam id") + parser.add_argument("config", help="Config file") + parser.add_argument("checkpoint", help="Checkpoint file") + parser.add_argument("--device", default="cuda:0", help="Device used for inference") + parser.add_argument( + "--palette", + default="cityscapes", + help="Color palette used for segmentation map", + ) + parser.add_argument( + "--show", action="store_true", help="Whether to show draw result" + ) + parser.add_argument( + "--show-wait-time", default=1, type=int, help="Wait time after imshow" + ) + parser.add_argument( + "--output-file", default=None, type=str, help="Output video file path" + ) + parser.add_argument( + "--output-fourcc", default="MJPG", type=str, help="Fourcc of the output video" + ) + parser.add_argument( + "--output-fps", default=-1, type=int, help="FPS of the output video" + ) + parser.add_argument( + "--output-height", default=-1, type=int, help="Frame height of the output video" + ) + parser.add_argument( + "--output-width", default=-1, type=int, help="Frame width of the output video" + ) + parser.add_argument( + "--opacity", + type=float, + default=0.3, + help="Opacity of painted segmentation map. 
In (0, 1] range.", + ) + args = parser.parse_args() + + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint, device=args.device) + if args.device == "cpu": + model = revert_sync_batchnorm(model) + + # build input video + if args.video.isdigit(): + args.video = int(args.video) + cap = cv2.VideoCapture(args.video) + assert cap.isOpened() + input_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) + input_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) + input_fps = cap.get(cv2.CAP_PROP_FPS) + input_length = cap.get(cv2.CAP_PROP_FRAME_COUNT) + + # init output video + writer_fusion = None + output_height = None + output_width = None + fusion_file = args.video.replace(".mp4", "_fusion.mp4") + segmap_file = args.video.replace(".mp4", "_segmap.mp4") + fourcc = cv2.VideoWriter_fourcc(*args.output_fourcc) + output_fps = args.output_fps if args.output_fps > 0 else input_fps + output_height = args.output_height if args.output_height > 0 else int(input_height) + output_width = args.output_width if args.output_width > 0 else int(input_width) + writer_fusion = cv2.VideoWriter( + fusion_file, fourcc, output_fps, (output_width, output_height), True + ) + writer_segmap = cv2.VideoWriter( + segmap_file, fourcc, output_fps, (output_width, output_height), True + ) + print(writer_fusion) + # start looping + bar = tqdm.tqdm(total=input_length) + try: + while True: + flag, frame = cap.read() + if not flag: + break + bar.update(1) + # test a single image + result = inference_model(model, frame) + + # blend raw image and prediction + pred = draw_sem_seg(result.pred_sem_seg) + draw_img = ( + pred[:, :, ::-1] * (1 - args.opacity) + frame * args.opacity + ).astype(np.uint8) + + if args.show: + cv2.imshow("video_demo", draw_img) + cv2.waitKey(args.show_wait_time) + if writer_fusion and writer_segmap: + if ( + draw_img.shape[0] != output_height + or draw_img.shape[1] != output_width + ): + draw_img = cv2.resize(draw_img, (output_width, output_height)) + writer_fusion.write(draw_img) + writer_segmap.write(pred[:, :, ::-1].astype(np.uint8)) + finally: + if writer_fusion and writer_segmap: + writer_fusion.release() + writer_segmap.release() + cap.release() + + +if __name__ == "__main__": + main() diff --git a/tools/mmseg_vis.py b/tools/mmseg_vis.py new file mode 100644 index 0000000..1a87081 --- /dev/null +++ b/tools/mmseg_vis.py @@ -0,0 +1,105 @@ +from mmseg.apis import MMSegInferencer, init_model, inference_model, show_result_pyplot +import sys +import numpy as np +import argparse +from rich.progress import track +from PIL import Image +import torch +from glob import glob +from typing import List, Tuple +import torchvision +import os +import os.path as osp +os.chdir(osp.abspath(osp.dirname(osp.dirname(__file__)))) + +sys.path.append(os.curdir) + + +def get_args() -> Tuple[str, str, str, str, int]: + parser = argparse.ArgumentParser() + parser.add_argument('--dataset_name', type=str, help='Image file') + parser.add_argument('--config', type=str, help='config path') + parser.add_argument('--checkpoint', type=str, help='checkpoint path') + parser.add_argument('--device', type=str, help='cpu/cuda:0', default="cpu") + args = parser.parse_args() + return args.dataset_name, args.config, args.checkpoint, args.device + + +def get_image_sub_path(dataset_name: str) -> str: + if dataset_name in ["cloudsen12_high_l1c", "cloudsen12_high_l2a", "l8_biome", "hrc_whu"]: + return "test" + return "val" + + +def get_image_list(dataset_name: str) -> List[str]: + # 
data/cloudsen12_high_l1c/img_dir/test/0.png + image_sub_dir = get_image_sub_path(dataset_name) + image_list = glob(os.path.join("data", dataset_name, + "img_dir", image_sub_dir, "*")) + return image_list + + +def get_classes(dataset_name: str) -> int: + if dataset_name in ["cloudsen12_high_l1c", "cloudsen12_high_l2a"]: + return ["clear", "thick cloud", "thin cloud", "cloud shadow"] + if dataset_name == "l8_biome": + return ["Clear", "Cloud Shadow", "Thin Cloud", "Cloud"] + if dataset_name in ["gf12ms_whu_gf1", "gf12ms_whu_gf2", "hrc_whu"]: + return ['clear sky', 'cloud'] + raise Exception("dataset_name not supported") + + +def get_palette(dataset_name: str) -> List[Tuple[int, int, int]]: + if dataset_name in ["cloudsen12_high_l1c", "cloudsen12_high_l2a"]: + return [79, 253, 199, 77, 2, 115, 251, 255, 41, 221, 53, 223] + if dataset_name == "l8_biome": + return [79, 253, 199, 221, 53, 223, 251, 255, 41, 77, 2, 115] + if dataset_name in ["gf12ms_whu_gf1", "gf12ms_whu_gf2", "hrc_whu"]: + return [79, 253, 199, 77, 2, 115] + raise Exception("dataset_name not supported") + + +def give_colors_to_mask(mask: np.ndarray, colors=None,save_path:str=None) -> np.ndarray: + """将mask转换为彩色 + + + """ + # 使用pillow 的p 模式将Mask进行上色 + + im = Image.fromarray(mask.astype(np.uint8)).convert("P") + + + im.putpalette(colors) + im.save(save_path) + + +def inference(): + import cloud_adapter + import cloud_adapter.models + dataset_name, config, checkpoint, device = get_args() + model = init_model(config, checkpoint, device) + img_list = get_image_list(dataset_name) + colors = get_palette(dataset_name) + os.makedirs(os.path.join("visualization", dataset_name,"cloud-adapter"), exist_ok=True) + os.makedirs(os.path.join("visualization", dataset_name,"label"), exist_ok=True) + os.makedirs(os.path.join("visualization", dataset_name,"input"), exist_ok=True) + for img_path in track(img_list,total=len(img_list)): + + result = inference_model(model, img_path) + ann_path = img_path.replace("img_dir", "ann_dir") + gt = np.array(Image.open(ann_path)) + img = np.array(Image.open(img_path).convert("RGB")) + pred_sem_seg: torch.Tensor = result.pred_sem_seg.data + pred_mask = pred_sem_seg.cpu().squeeze().numpy() + + filename = osp.basename(img_path).split(".")[0] + ".png" + give_colors_to_mask(pred_mask, colors,os.path.join("visualization", dataset_name,"cloud-adapter",filename)) + + give_colors_to_mask(gt,colors,os.path.join("visualization", dataset_name,"label",filename)) + Image.fromarray(img).save(os.path.join("visualization", dataset_name,"input",filename)) + + +if __name__ == '__main__': + # examlr usage:python tools/mmseg_vis.py --dataset_name hrc_whu --config work_dirs/ours_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu/ours_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu.py --checkpoint work_dirs/ours_adapter_pmaa_convnext_lora_16_adapter_all_hrc_whu/full_weight.pth --device cuda:3 + # main() + inference() diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 0000000..eb4791a --- /dev/null +++ b/tools/test.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
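+# Evaluation entry point: builds an mmengine Runner from the given config,
+# loads the REIN/head checkpoint passed as `checkpoint`, optionally loads a
+# separately stored backbone through the LoadBackboneHook custom hook
+# (--backbone), and runs the test loop; --tta switches in the TTA pipeline.
+# Illustrative usage (config and checkpoint paths are examples only):
+#   python tools/test.py work_dirs/<exp>/<exp>.py work_dirs/<exp>/full_weight.pth --backbone checkpoints/backbone_converted.pth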
+import argparse +import os +import os.path as osp + +os.chdir(osp.abspath(osp.dirname(osp.dirname(__file__)))) +import sys + +sys.path.append(os.curdir) + +from mmengine.config import Config, DictAction +from mmengine.runner import Runner +import cloud_adapter.datasets + + +# TODO: support fuse_conv_bn, visualization, and format_only +def parse_args(): + parser = argparse.ArgumentParser(description="MMSeg test (and eval) a model") + parser.add_argument("config", help="train config file path") + parser.add_argument("checkpoint", help="rein and head checkpoint file") + parser.add_argument("--backbone", help="backbone checkpoint file", default="") + parser.add_argument( + "--work-dir", + help=( + "if specified, the evaluation metric results will be dumped" + "into the directory as json" + ), + ) + parser.add_argument( + "--out", + type=str, + help="The directory to save output prediction for offline evaluation", + ) + parser.add_argument("--show", action="store_true", help="show prediction results") + parser.add_argument( + "--show-dir", + help="directory where painted images will be saved. " + "If specified, it will be automatically saved " + "to the work_dir/timestamp/show_dir", + ) + parser.add_argument( + "--wait-time", type=float, default=2, help="the interval of show (s)" + ) + parser.add_argument( + "--cfg-options", + nargs="+", + action=DictAction, + help="override some settings in the used config, the key-value pair " + "in xxx=yyy format will be merged into config file. If the value to " + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + "Note that the quotation marks are necessary and that no white space " + "is allowed.", + ) + parser.add_argument( + "--launcher", + choices=["none", "pytorch", "slurm", "mpi"], + default="none", + help="job launcher", + ) + parser.add_argument("--tta", action="store_true", help="Test time augmentation") + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument("--local_rank", "--local-rank", type=int, default=0) + args = parser.parse_args() + if "LOCAL_RANK" not in os.environ: + os.environ["LOCAL_RANK"] = str(args.local_rank) + + return args + + +def trigger_visualization_hook(cfg, args): + default_hooks = cfg.default_hooks + if "visualization" in default_hooks: + visualization_hook = default_hooks["visualization"] + visualization_hook["interval"] = 1 + # Turn on visualization + # visualization_hook['draw'] = True + # if args.show: + # visualization_hook['show'] = True + # visualization_hook['wait_time'] = args.wait_time + if args.show_dir: + visulizer = cfg.visualizer + visulizer["save_dir"] = args.show_dir + else: + raise RuntimeError( + "VisualizationHook must be included in default_hooks." 
+ "refer to usage " + "\"visualization=dict(type='VisualizationHook')\"" + ) + + return cfg + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get("work_dir", None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join( + "./work_dirs", osp.splitext(osp.basename(args.config))[0] + ) + cfg.work_dir = cfg.work_dir + "_test" + cfg.load_from = args.checkpoint + if args.backbone: + custom_hooks = getattr(cfg, "custom_hooks", []) + custom_hooks.append( + dict(type="LoadBackboneHook", checkpoint_path=args.backbone) + ) + setattr(cfg, "custom_hooks", custom_hooks) + + if args.show or args.show_dir: + cfg = trigger_visualization_hook(cfg, args) + + if args.tta: + cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline + cfg.tta_model.module = cfg.model + cfg.model = cfg.tta_model + + # add output_dir in metric + if args.out is not None: + cfg.test_evaluator["output_dir"] = args.out + cfg.test_evaluator["keep_results"] = True + + # build the runner from config + runner = Runner.from_cfg(cfg) + + # start testing + runner.test() + + +if __name__ == "__main__": + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000..73d79e3 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import logging +import os +import os.path as osp + +os.chdir(osp.abspath(osp.dirname(osp.dirname(__file__)))) +import sys + +sys.path.append(os.curdir) +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.runner import Runner + +from mmseg.registry import RUNNERS +import cloud_adapter.datasets +import cloud_adapter +import cloud_adapter.models + +def parse_args(): + parser = argparse.ArgumentParser(description="Train a segmentor") + parser.add_argument("config", help="train config file path") + parser.add_argument("--work-dir", help="the dir to save logs and models") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume from the latest checkpoint in the work_dir automatically", + ) + parser.add_argument( + "--amp", + action="store_true", + default=False, + help="enable automatic-mixed-precision training", + ) + parser.add_argument( + "--cfg-options", + nargs="+", + action=DictAction, + help="override some settings in the used config, the key-value pair " + "in xxx=yyy format will be merged into config file. If the value to " + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + "Note that the quotation marks are necessary and that no white space " + "is allowed.", + ) + parser.add_argument( + "--launcher", + choices=["none", "pytorch", "slurm", "mpi"], + default="none", + help="job launcher", + ) + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. 
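+    # Both spellings are therefore accepted below. An illustrative distributed
+    # launch (paths and GPU count are placeholders) would be:
+    #   torchrun --nproc_per_node=4 tools/train.py <config>.py --launcher pytorch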
+ parser.add_argument("--local_rank", "--local-rank", type=int, default=0) + args = parser.parse_args() + if "LOCAL_RANK" not in os.environ: + os.environ["LOCAL_RANK"] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get("work_dir", None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join( + "./work_dirs", osp.splitext(osp.basename(args.config))[0] + ) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == "AmpOptimWrapper": + print_log( + "AMP training is already enabled in your config.", + logger="current", + level=logging.WARNING, + ) + else: + assert optim_wrapper == "OptimWrapper", ( + "`--amp` is only supported when the optimizer wrapper type is " + f"`OptimWrapper` but got {optim_wrapper}." + ) + cfg.optim_wrapper.type = "AmpOptimWrapper" + cfg.optim_wrapper.loss_scale = "dynamic" + + # resume training + cfg.resume = args.resume + + # build the runner from config + if "runner_type" not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == "__main__": + main() diff --git a/tools/vis.py b/tools/vis.py new file mode 100644 index 0000000..fc70cd7 --- /dev/null +++ b/tools/vis.py @@ -0,0 +1,22 @@ +import numpy as np +from PIL import Image + +img_root = "data/cloudsen12_high_l2a/img_dir/test" +ann_root = "data/cloudsen12_high_l2a/ann_dir/test" +# palette for 4 classes segmentation mask +palette = [ + [79, 253, 199], + [77, 2, 115], + [251, 255, 41], + [221, 53, 223], +] +for i in range(1, 100): + img = Image.open(f"{img_root}/{i}.png") + ann = Image.open(f"{ann_root}/{i}.png") + ann = np.array(ann) + segmap = np.zeros((ann.shape[0], ann.shape[1], 3), dtype=np.uint8) + for label, color in enumerate(palette): + segmap[ann == label] = color + segmap = Image.fromarray(segmap) + segmap.save(f"vis_img/{i}_vis.png") + img.save(f"vis_ann/{i}_vis.png") \ No newline at end of file diff --git a/tools/visualize.py b/tools/visualize.py new file mode 100644 index 0000000..1bbdbf4 --- /dev/null +++ b/tools/visualize.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
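+# Example invocation (hypothetical paths, adjust to your own setup):
+#   python tools/visualize.py configs/<your_config>.py work_dirs/<exp>/<checkpoint>.pth demo_images/ --save_dir work_dirs/show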
+import argparse +import os +import os.path as osp + +os.chdir(osp.abspath(osp.dirname(osp.dirname(__file__)))) +import sys + +sys.path.append(os.curdir) + +from mmengine.config import Config +from mmseg.utils import get_classes, get_palette +from mmengine.runner.checkpoint import _load_checkpoint +from cloud_adapter.utils import init_model +from mmseg.apis import inference_model +import cloud_adapter +import tqdm +import mmengine +import torch +import numpy as np +from PIL import Image + +def parse_args(): + parser = argparse.ArgumentParser(description="MMSeg test (and eval) a model") + parser.add_argument("config", help="Path to the training configuration file.") + parser.add_argument("checkpoint", help="Path to the checkpoint file for both the REIN and head models.") + parser.add_argument("images", help="Directory or file path of images to be processed.") + parser.add_argument("--suffix", default=".png", help="File suffix to filter images in the directory. Default is '.png'.") + parser.add_argument("--not-recursive", action='store_false', help="Whether to search images recursively in subfolders. Default is recursive.") + parser.add_argument("--search-key", default="", help="Keyword to filter images within the directory. Default is no filtering.") + parser.add_argument( + "--backbone", + default="checkpoints/dinov2_vitl14_converted_1024x1024.pth", + help="Path to the backbone model checkpoint. Default is 'checkpoints/dinov2_vitl14_converted_1024x1024.pth'." + ) + parser.add_argument("--save_dir", default="work_dirs/show", help="Directory to save the output images. Default is 'work_dirs/show'.") + parser.add_argument("--tta", action="store_true", help="Enable test time augmentation. Default is disabled.") + parser.add_argument("--device", default="cuda:0", help="Device to use for computation. 
Default is 'cuda:0'.") + args = parser.parse_args() + return args + +def load_backbone(checkpoint: dict, backbone_path: str) -> None: + converted_backbone_weight = _load_checkpoint(backbone_path, map_location="cpu") + if "state_dict" in checkpoint: + checkpoint["state_dict"].update( + {f"backbone.{k}": v for k, v in converted_backbone_weight.items()} + ) + else: + checkpoint.update( + {f"backbone.{k}": v for k, v in converted_backbone_weight.items()} + ) + + +classes = get_classes("cityscapes") +palette = get_palette("cityscapes") + + +def draw_sem_seg(sem_seg: torch.Tensor): + num_classes = len(classes) + sem_seg = sem_seg.data.squeeze(0) + H, W = sem_seg.shape + ids = torch.unique(sem_seg).cpu().numpy() + legal_indices = ids < num_classes + ids = ids[legal_indices] + labels = np.array(ids, dtype=np.int64) + colors = [palette[label] for label in labels] + colors = [torch.tensor(color, dtype=torch.uint8).view(1, 1, 3) for color in colors] + result = torch.zeros([H, W, 3], dtype=torch.uint8) + for label, color in zip(labels, colors): + result[sem_seg == label, :] = color + return result.cpu().numpy() + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + if "test_pipeline" not in cfg: + cfg.test_pipeline = [ + dict(type="LoadImageFromFile"), + dict( + keep_ratio=True, + scale=( + 1920, + 1080, + ), + type="Resize", + ), + dict(type="PackSegInputs"), + ] + model = init_model(cfg, args.checkpoint, device=args.device) + model=model.cuda(args.device) + state_dict = model.state_dict() + load_backbone(state_dict, args.backbone) + model.load_state_dict(state_dict) + mmengine.mkdir_or_exist(args.save_dir) + images = [] + if osp.isfile(args.images): + images.append(args.images) + elif osp.isdir(args.images): + for im in mmengine.scandir(args.images, suffix=args.suffix, recursive=args.not_recursive): + if args.search_key in im: + images.append(osp.join(args.images, im)) + else: + raise NotImplementedError() + print(f"Collect {len(images)} images") + for im_path in tqdm.tqdm(images): + result = inference_model(model, im_path) + pred = draw_sem_seg(result.pred_sem_seg) + img = Image.open(im_path).convert("RGB") + pred = Image.fromarray(pred).resize( + [img.width, img.height], resample=Image.NEAREST + ) + vis = Image.new("RGB", [img.width * 2, img.height]) + vis.paste(img, (0, 0)) + vis.paste(pred, (img.width, 0)) + vis.save(osp.join(args.save_dir, osp.basename(im_path))) + print(f"Results are saved in {args.save_dir}") + + +if __name__ == "__main__": + main() diff --git a/utils/give_colors_to_mask.py b/utils/give_colors_to_mask.py new file mode 100644 index 0000000..ff94312 --- /dev/null +++ b/utils/give_colors_to_mask.py @@ -0,0 +1,102 @@ +import os +import numpy as np +from PIL import Image +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor + +# Define the function to retrieve the color palette for a given dataset +def get_palette(dataset_name: str): + if dataset_name in ["cloudsen12_high_l1c", "cloudsen12_high_l2a"]: + return [79, 253, 199, 77, 2, 115, 251, 255, 41, 221, 53, 223] + if dataset_name == "l8_biome": + return [79, 253, 199, 221, 53, 223, 251, 255, 41, 77, 2, 115] + if dataset_name in ["gf12ms_whu_gf1", "gf12ms_whu_gf2", "hrc_whu"]: + return [79, 253, 199, 77, 2, 115] + raise Exception("dataset_name not supported") + +# Function to apply the color palette to a mask +def give_colors_to_mask(mask: np.ndarray, colors=None) -> np.ndarray: + """Convert a mask to a colorized version using the specified palette.""" + im = 
Image.fromarray(mask.astype(np.uint8)).convert("P") + im.putpalette(colors) + return im + +# Function to process a single file +def process_file(file_path, palette): + try: + # Load the mask + mask = np.array(Image.open(file_path)) + + # Apply the color palette + colored_mask = give_colors_to_mask(mask, palette) + + # Save the colored mask, overwriting the original file + colored_mask.save(file_path) + return True + except Exception as e: + print(f"Error processing {file_path}: {e}") + return False + +# Main processing function for a dataset +def process_dataset(dataset_name, base_root, progress_bar): + ann_dir = os.path.join(base_root, dataset_name, "ann_dir") + if not os.path.exists(ann_dir): + print(f"Annotation directory does not exist for {dataset_name}: {ann_dir}") + return + + # Get the color palette for this dataset + palette = get_palette(dataset_name) + + # Gather all files to process + files_to_process = [] + for split in ["train", "val", "test"]: + split_dir = os.path.join(ann_dir, split) + if not os.path.exists(split_dir): + print(f"Split directory does not exist for {dataset_name}: {split_dir}") + continue + + # Add all png files in the directory to the list + for file_name in os.listdir(split_dir): + if file_name.endswith(".png"): + files_to_process.append(os.path.join(split_dir, file_name)) + + # Multi-threaded processing + with ThreadPoolExecutor() as executor: + results = list(tqdm( + executor.map(lambda f: process_file(f, palette), files_to_process), + total=len(files_to_process), + desc=f"Processing {dataset_name}", + leave=False + )) + + # Update the progress bar + progress_bar.update(len(files_to_process)) + + print(f"{dataset_name}: Processed {sum(results)} files out of {len(files_to_process)}.") + +# Define the root directory and datasets +base_root = "data" # Replace with your datasets' root directory +dataset_names = [ + "cloudsen12_high_l1c", + "cloudsen12_high_l2a", + "gf12ms_whu_gf1", + "gf12ms_whu_gf2", + "hrc_whu", + "l8_biome" +] + +# Main script +if __name__ == "__main__": + # Calculate total number of files for all datasets + total_files = 0 + for dataset_name in dataset_names: + ann_dir = os.path.join(base_root, dataset_name, "ann_dir") + for split in ["train", "val", "test"]: + split_dir = os.path.join(ann_dir, split) + if os.path.exists(split_dir): + total_files += len([f for f in os.listdir(split_dir) if f.endswith(".png")]) + + # Create a progress bar + with tqdm(total=total_files, desc="Overall Progress") as progress_bar: + for dataset_name in dataset_names: + process_dataset(dataset_name, base_root, progress_bar) diff --git a/utils/stretch.py b/utils/stretch.py new file mode 100644 index 0000000..302fd73 --- /dev/null +++ b/utils/stretch.py @@ -0,0 +1,107 @@ +# -*- encoding: utf-8 -*- +''' +@File : stretch.py +@Time : 2024/08/11 01:35:36 +@Author : XavierJiezou +@Version : 1.0 +@Contact : xuechaozou@foxmail.com +@Citation: https://www.nv5geospatialsoftware.com/docs/BackgroundStretchTypes.html +''' + +import numpy as np +import matplotlib.pyplot as plt + + +def linear_stretch(image): + min_val = np.min(image) + max_val = np.max(image) + stretched = (image - min_val) * 255.0 / (max_val - min_val) + return np.clip(stretched, 0, 255).astype(np.uint8) + +def linear_percent_stretch(image, percent=2): + low, high = np.percentile(image, (percent, 100 - percent)) + stretched = (image - low) * 255.0 / (high - low) + return np.clip(stretched, 0, 255).astype(np.uint8) + +def equalization_stretch(image): + hist, bins = np.histogram(image.flatten(), 256, [0, 
256])
+    cdf = hist.cumsum()
+    cdf_normalized = cdf * 255 / cdf[-1]
+    stretched = np.interp(image.flatten(), bins[:-1], cdf_normalized)
+    return stretched.reshape(image.shape).astype(np.uint8)
+
+def gaussian_stretch(image):
+    mean = np.mean(image)
+    std_dev = np.std(image)
+    stretched = 127 + (image - mean) * (128.0 / (3 * std_dev + 1e-6))  # 1e-6 to avoid division by zero
+    return np.clip(stretched, 0, 255).astype(np.uint8)
+
+def square_root_stretch(image):
+    stretched = np.sqrt(image) * np.sqrt(255.0 / np.max(image))
+    return np.clip(stretched, 0, 255).astype(np.uint8)
+
+def logarithmic_stretch(image):
+    stretched = np.log1p(image) * (255.0 / np.log1p(np.max(image)))
+    return np.clip(stretched, 0, 255).astype(np.uint8)
+
+def optimized_linear_stretch(image, min_percent=0.025, max_percent=0.99, min_adjust_percent=0.1, max_adjust_percent=0.5):
+    cdf, bins = np.histogram(image.flatten(), 256, [0, 256])
+    cdf = cdf.cumsum()
+    cdf_normalized = cdf / cdf[-1]
+
+    min_value = np.searchsorted(cdf_normalized, min_percent)
+    max_value = np.searchsorted(cdf_normalized, max_percent)
+
+    a = bins[min_value]
+    b = bins[max_value]
+
+    c = a - min_adjust_percent * (b - a)
+    d = b + max_adjust_percent * (b - a)
+
+    stretched = (image - c) * 255.0 / (d - c)
+    return np.clip(stretched, 0, 255).astype(np.uint8)
+
+def main(image_path):
+    # Load a sample image
+    import tifffile as tiff
+    image = tiff.imread(image_path)
+    # image = (image - image.min()) / (image.max() - image.min())
+    image = (image / 65535. * 255.).astype(np.uint8)
+
+    # Apply the different stretch algorithms
+    linear_image = linear_stretch(image)
+    percent_image = linear_percent_stretch(image)
+    equalized_image = equalization_stretch(image)
+    gaussian_image = gaussian_stretch(image)
+    sqrt_image = square_root_stretch(image)
+    log_image = logarithmic_stretch(image)
+    optimized_image = optimized_linear_stretch(image)
+
+    # Display the results
+    fig, axs = plt.subplots(2, 4, figsize=(12, 6))
+    axs[0, 0].imshow(image, cmap='gray')
+    axs[0, 0].set_title("Original Image")
+    axs[0, 1].imshow(linear_image, cmap='gray')
+    axs[0, 1].set_title("Linear Stretch")
+    axs[0, 2].imshow(percent_image, cmap='gray')
+    axs[0, 2].set_title("Linear Percent Stretch")
+    axs[0, 3].imshow(equalized_image, cmap='gray')
+    axs[0, 3].set_title("Equalization Stretch")
+    axs[1, 0].imshow(gaussian_image, cmap='gray')
+    axs[1, 0].set_title("Gaussian Stretch")
+    axs[1, 1].imshow(sqrt_image, cmap='gray')
+    axs[1, 1].set_title("Square Root Stretch")
+    axs[1, 2].imshow(log_image, cmap='gray')
+    axs[1, 2].set_title("Logarithmic Stretch")
+    axs[1, 3].imshow(optimized_image, cmap='gray')
+    axs[1, 3].set_title("Optimized Linear Stretch")
+
+    for ax in axs.flat:
+        ax.axis('off')
+
+    plt.tight_layout()
+    plt.savefig("images/s2_rgb_int16_stretch.jpg", bbox_inches="tight", pad_inches=0)
+    plt.show()
+
+if __name__ == "__main__":
+    main("images/s2_rgb_int16.tif")
diff --git a/utils/upload_zip_to_hub.py b/utils/upload_zip_to_hub.py
new file mode 100644
index 0000000..bd0aee7
--- /dev/null
+++ b/utils/upload_zip_to_hub.py
@@ -0,0 +1,67 @@
+import os
+import zipfile
+from huggingface_hub import HfApi, HfFolder
+
+# Define the root directory containing all datasets
+base_root = "data"  # Replace with the directory containing all datasets
+dataset_repo = "XavierJiezou/Cloud-Adapter"  # Hugging Face repository name
+dataset_names = [
+    "hrc_whu",
+    "gf12ms_whu_gf1",
+    "gf12ms_whu_gf2",
+    "cloudsen12_high_l1c",
+    "cloudsen12_high_l2a",
+    "l8_biome",
+]
+
+# Function to create a ZIP file for a dataset directory
+def create_zip(dataset_path, output_path):
+ """ + Compress a dataset directory into a ZIP file. + Args: + dataset_path (str): Path to the dataset directory. + output_path (str): Path to save the ZIP file. + """ + with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf: + for root, _, files in os.walk(dataset_path): + for file in files: + file_path = os.path.join(root, file) + arcname = os.path.relpath(file_path, dataset_path) + zipf.write(file_path, arcname) + print(f"Compressed {dataset_path} into {output_path}") + +# Function to upload ZIP files to Hugging Face Hub +def upload_zip_to_hub(dataset_name, zip_path, repo_name): + """ + Upload a ZIP file to a Hugging Face repository. + Args: + dataset_name (str): Name of the dataset (used as a file identifier). + zip_path (str): Path to the ZIP file. + repo_name (str): Hugging Face repository name. + """ + api = HfApi() + token = HfFolder.get_token() + file_name = f"{dataset_name}.zip" + api.upload_file( + path_or_fileobj=zip_path, + path_in_repo=file_name, + repo_id=repo_name, + repo_type="dataset", + token=token, + ) + print(f"Uploaded {file_name} to {repo_name}") + +# Main script +if __name__ == "__main__": + for dataset_name in dataset_names: + dataset_path = os.path.join(base_root, dataset_name) + if not os.path.exists(dataset_path): + print(f"Dataset directory does not exist: {dataset_path}") + continue + + # Create ZIP file + zip_path = f"{dataset_name}.zip" + create_zip(dataset_path, zip_path) + + # Upload ZIP file to Hugging Face Hub + upload_zip_to_hub(dataset_name, zip_path, dataset_repo)
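+
+# Note: uploading assumes you are already authenticated with the Hugging Face
+# Hub, e.g. via `huggingface-cli login`, so that HfFolder.get_token() above
+# returns a valid token with write access to the target repository.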