diff --git a/.github/issue_stale.yaml b/.github/issue_stale.yaml
deleted file mode 100644
index 17948d3..0000000
--- a/.github/issue_stale.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# Number of days of inactivity before an issue becomes stale
-daysUntilStale: 7
-# Number of days of inactivity before a stale issue is closed
-daysUntilClose: 7
-# Issues with these labels will never be considered stale
-exemptLabels:
- - backlog
-# Label to use when marking an issue as stale
-staleLabel: stale
-# Comment to post when marking an issue as stale. Set to `false` to disable
-markComment: >
- This issue has been automatically marked as stale because it has not had
- recent activity. It will be closed if no further activity occurs. Thank you
- for your contributions.
-# Comment to post when closing a stale issue. Set to `false` to disable
-closeComment: false
\ No newline at end of file
diff --git a/.github/workflows/issue_stale.yaml b/.github/workflows/issue_stale.yaml
new file mode 100644
index 0000000..7c11c2b
--- /dev/null
+++ b/.github/workflows/issue_stale.yaml
@@ -0,0 +1,30 @@
+name: Close inactive issues
+on:
+ schedule:
+ - cron: "35 11 * * 5"
+
+env:
+ DAYS_BEFORE_ISSUE_STALE: 30
+ DAYS_BEFORE_ISSUE_CLOSE: 14
+
+jobs:
+ close-issues:
+ runs-on: ubuntu-latest
+ permissions:
+ issues: write
+ pull-requests: write
+ steps:
+ - uses: actions/stale@v5
+ with:
+ days-before-issue-stale: ${{ env.DAYS_BEFORE_ISSUE_STALE }}
+ days-before-issue-close: ${{ env.DAYS_BEFORE_ISSUE_CLOSE }}
+ stale-issue-label: "stale"
+ stale-issue-message: |
+ This issue is stale because it has been open for ${{ env.DAYS_BEFORE_ISSUE_STALE }} days with no activity.
+ It will be closed if no further activity occurs. Let us know if you still need help!
+ close-issue-message: |
+ This issue is being closed because it has been stale for ${{ env.DAYS_BEFORE_ISSUE_CLOSE }} days with no activity.
+ If you still need help, please feel free to leave comments.
+ days-before-pr-stale: -1
+ days-before-pr-close: -1
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..0bd27a2
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "OpenSceneFlow"]
+ path = OpenSceneFlow
+ url = https://github.com/KTH-RPL/OpenSceneFlow.git
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index b80f234..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,43 +0,0 @@
-# check more: https://hub.docker.com/r/nvidia/cuda
-FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
-ENV DEBIAN_FRONTEND noninteractive
-
-RUN apt update && apt install -y --no-install-recommends \
- git curl vim rsync htop
-
-RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda clean -ya && /opt/conda/bin/conda init bash
-
-RUN curl -o ~/mamba.sh -LO https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
- chmod +x ~/mamba.sh && \
- ~/mamba.sh -b -p /opt/mambaforge && \
- rm ~/mamba.sh && /opt/mambaforge/bin/mamba init bash
-
-# install zsh and oh-my-zsh
-RUN apt install -y wget git zsh tmux vim g++
-RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.5/zsh-in-docker.sh)" -- \
- -t robbyrussell -p git \
- -p https://github.com/agkozak/zsh-z \
- -p https://github.com/zsh-users/zsh-autosuggestions \
- -p https://github.com/zsh-users/zsh-completions \
- -p https://github.com/zsh-users/zsh-syntax-highlighting
-
-RUN printf "y\ny\ny\n\n" | bash -c "$(curl -fsSL https://raw.githubusercontent.com/Kin-Zhang/Kin-Zhang/main/scripts/setup_ohmyzsh.sh)"
-RUN /opt/conda/bin/conda init zsh && /opt/mambaforge/bin/mamba init zsh
-
-# change to conda env
-ENV PATH /opt/conda/bin:$PATH
-ENV PATH /opt/mambaforge/bin:$PATH
-
-RUN mkdir -p /home/kin/workspace && cd /home/kin/workspace && git clone https://github.com/KTH-RPL/SeFlow.git
-WORKDIR /home/kin/workspace/SeFlow
-RUN apt-get update && apt-get install libgl1 -y
-# need read the gpu device info to compile the cuda extension
-RUN cd /home/kin/workspace/SeFlow && /opt/mambaforge/bin/mamba env create -f environment.yaml
-RUN cd /home/kin/workspace/SeFlow/assets/cuda/mmcv && /opt/mambaforge/envs/seflow/bin/python ./setup.py install
-RUN cd /home/kin/workspace/SeFlow/assets/cuda/chamfer3D && /opt/mambaforge/envs/seflow/bin/python ./setup.py install
-
-
diff --git a/OpenSceneFlow b/OpenSceneFlow
new file mode 160000
index 0000000..6226b07
--- /dev/null
+++ b/OpenSceneFlow
@@ -0,0 +1 @@
+Subproject commit 6226b07784dbd6f1aa52931d419a7d3707b4e3d3
diff --git a/README.md b/README.md
index d2fa38d..49fd33a 100644
--- a/README.md
+++ b/README.md
@@ -8,70 +8,36 @@ SeFlow: A Self-Supervised Scene Flow Method in Autonomous Driving

-2024/11/18 16:17: Update model and demo data download link through HuggingFace, Personally I found `wget` from HuggingFace link is much faster than Zenodo.
-
-2024/09/26 16:24: All codes already uploaded and tested. You can to try training directly by downloading (through [HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow)/[Zenodo](https://zenodo.org/records/13744999)) demo data or pretrained weight for evaluation.
-
-Pre-trained weights for models are available in [Zenodo](https://zenodo.org/records/13744999)/[HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow) link. Check usage in [2. Evaluation](#2-evaluation) or [3. Visualization](#3-visualization).
-
Task: __Self-Supervised__ Scene Flow Estimation in Autonomous Driving. No human-label needed. Real-time inference (15-20Hz in RTX3090).
-We directly follow our previous work [code structure](https://github.com/KTH-RPL/DeFlow), so you may want to start from the easier one with supervised learning first: Try [DeFlow](https://github.com/KTH-RPL/DeFlow). Then you will find this is simple to you (things about how to train under self-supervised). Here are **Scripts** quick view in this repo:
-
-- `dataprocess/extract_*.py` : pre-process data before training to speed up the whole training time.
- [Dataset we included now: Argoverse 2 and Waymo. more on the way: Nuscenes, custom data.]
-
-- `process.py`: process data with save dufomap, cluster labels inside file. Only needed once for training.
-
-- `train.py`: Train the model and get model checkpoints. Pls remember to check the config.
-
-- `eval.py` : Evaluate the model on the validation/test set. And also output the zip file to upload to online leaderboard.
-
-- `save.py` : Will save result into h5py file, using [tool/visualization.py] to show results with interactive window.
-
- 🎁 One repository, All methods!
-
-You can try following methods in our code without any effort to make your own benchmark.
+📜 2025/02/18: Merging all scene flow code to [OpenSceneFlow codebase](https://github.com/KTH-RPL/OpenSceneFlow) for afterward code maintenance. This repo saved README, [cluster slurm files](assets/slurm), and [quick core file](lossfunc.py) in SeFlow. The old source code branch is also [available here](https://github.com/KTH-RPL/SeFlow/tree/source).
-- [x] [SeFlow](https://arxiv.org/abs/2407.01702) (Ours 🚀): ECCV 2024
-- [x] [DeFlow](https://arxiv.org/abs/2401.16122) (Ours 🚀): ICRA 2024
-- [x] [FastFlow3d](https://arxiv.org/abs/2103.01306): RA-L 2021
-- [x] [ZeroFlow](https://arxiv.org/abs/2305.10424): ICLR 2024, their pre-trained weight can covert into our format easily through [the script](tools/zerof2ours.py).
-- [ ] [NSFP](https://arxiv.org/abs/2111.01253): NeurIPS 2021, faster 3x than original version because of [our CUDA speed up](assets/cuda/README.md), same (slightly better) performance. Done coding, public after review.
-- [ ] [FastNSF](https://arxiv.org/abs/2304.09121): ICCV 2023. Done coding, public after review.
-
-- [ ] ... more on the way
+2024/11/18 16:17: Updated model and demo data download links through HuggingFace. Personally, I found `wget` from the HuggingFace link is much faster than Zenodo.
-
+2024/09/26 16:24: All codes already uploaded and tested. You can try training directly by downloading (through [HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow)/[Zenodo](https://zenodo.org/records/13744999)) demo data or pretrained weight for evaluation.
-💡: Want to learn how to add your own network in this structure? Check [Contribute](assets/README.md#contribute) section and know more about the code. Fee free to pull request!
+Pre-trained weights for models are available in [Zenodo](https://zenodo.org/records/13744999)/[HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow) link. Check usage in [2. Evaluation](#2-evaluation) or [3. Visualization](#3-visualization).
## 0. Setup
-**Environment**: Same to [DeFlow](https://github.com/KTH-RPL/DeFlow). And even lighter here with extracting mmcv module we needed into cuda assets.
+**Environment**: Clone the repo and build the environment, check [detailed installation](https://github.com/KTH-RPL/OpenSceneFlow/blob/main/assets/README.md) for more information. [Conda](https://docs.conda.io/projects/miniconda/en/latest/)/[Mamba](https://github.com/mamba-org/mamba) is recommended.
+
```bash
-git clone --recursive https://github.com/KTH-RPL/SeFlow.git
-cd SeFlow && mamba env create -f environment.yaml
+git clone --recursive https://github.com/KTH-RPL/OpenSceneFlow.git
+cd OpenSceneFlow
+mamba env create -f environment.yaml
```
CUDA package (need install nvcc compiler), the compile time is around 1-5 minutes:
```bash
-mamba activate seflow
+mamba activate opensf
# CUDA already install in python environment. I also tested others version like 11.3, 11.4, 11.7, 11.8 all works
cd assets/cuda/mmcv && python ./setup.py install && cd ../../..
cd assets/cuda/chamfer3D && python ./setup.py install && cd ../../..
```
-Or you always can choose [Docker](https://en.wikipedia.org/wiki/Docker_(software)) which isolated environment and free yourself from installation, you can pull it by.
-If you have different arch, please build it by yourself `cd SeFlow && docker build -t zhangkin/seflow` by going through [build-docker-image](https://github.com/KTH-RPL/DeFlow/blob/main/assets/README.md/#build-docker-image) section.
-```bash
-# option 1: pull from docker hub
-docker pull zhangkin/seflow
-
-# run container
-docker run -it --gpus all -v /dev/shm:/dev/shm -v /home/kin/data:/home/kin/data --name seflow zhangkin/seflow /bin/zsh
-```
+Or another environment setup choice is [Docker](https://en.wikipedia.org/wiki/Docker_(software)), which provides an isolated environment; check more information in [OpenSceneFlow/assets/README.md](https://github.com/KTH-RPL/OpenSceneFlow/blob/main/assets/README.md#docker-environment).
## 1. Run & Train
@@ -87,16 +53,6 @@ wget https://huggingface.co/kin-zhang/OpenSceneFlow/resolve/main/demo_data.zip
unzip demo_data.zip -p /home/kin/data/av2
```
-#### Prepare raw data
-
-Checking more information (step for downloading raw data, storage size, #frame etc) in [dataprocess/README.md](dataprocess/README.md). Extract all data to unified `.h5` format.
-[Runtime: Normally need 45 mins finished run following commands totally in setup mentioned in our paper]
-```bash
-python dataprocess/extract_av2.py --av2_type sensor --data_mode train --argo_dir /home/kin/data/av2 --output_dir /home/kin/data/av2/preprocess_v2
-python dataprocess/extract_av2.py --av2_type sensor --data_mode val --mask_dir /home/kin/data/av2/3d_scene_flow
-python dataprocess/extract_av2.py --av2_type sensor --data_mode test --mask_dir /home/kin/data/av2/3d_scene_flow
-```
-
#### Process train data
Process train data for self-supervised learning. Only training data needs this step. [Runtime: Normally need 15 hours for my desktop, 3 hours for the cluster with five available nodes parallel running.]
diff --git a/assets/README.md b/assets/README.md
deleted file mode 100644
index 07cc3cd..0000000
--- a/assets/README.md
+++ /dev/null
@@ -1,78 +0,0 @@
-SeFlow Assets
----
-
-There are two ways to setup the environment: conda in your desktop and docker container isolate environment.
-
-## Docker
-
-Docker installation check [DeFlow Assets](https://github.com/KTH-RPL/DeFlow/blob/main/assets/README.md#docker-environment). Then you can build and run the container by:
-
-```bash
-cd SeFlow
-docker build -t zhangkin/seflow .
-
-docker run -it --gpus all -v /dev/shm:/dev/shm -v /home/kin/data:/home/kin/data --name seflow zhangkin/seflow /bin/zsh
-```
-
-## Installation
-
-We will use conda to manage the environment with mamba for faster package installation.
-
-### System
-Install conda with mamba for package management and for faster package installation:
-```bash
-curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
-bash Miniforge3-$(uname)-$(uname -m).sh
-```
-
-### Environment
-
-Create base env: [~5 mins]
-
-```bash
-git clone https://github.com/KTH-RPL/SeFlow.git
-mamba env create -f assets/environment.yml
-```
-
-CUDA package (nvcc compiler already installed through conda), the compile time is around 1-5 minutes:
-```bash
-mamba activate seflow
-cd assets/cuda/mmcv && python ./setup.py install && cd ../../..
-cd assets/cuda/chamfer3D && python ./setup.py install && cd ../../..
-```
-
-
-Checking important packages in our environment now:
-```bash
-mamba activate seflow
-python -c "import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.version.cuda)"
-python -c "import lightning.pytorch as pl; print(pl.__version__)"
-python -c "from assets.cuda.mmcv import Voxelization, DynamicScatter;print('successfully import on our lite mmcv package')"
-python -c "from assets.cuda.chamfer3D import nnChamferDis;print('successfully import on our chamfer3D package')"
-```
-
-
-### Other issues
-
-1. looks like open3d and fire package conflict, not sure
- - need install package like `pip install --ignore-installed`, ref: [pip cannot install distutils installed project](https://stackoverflow.com/questions/53807511/pip-cannot-uninstall-package-it-is-a-distutils-installed-project), my error: `ERROR: Cannot uninstall 'blinker'.`
- - need specific werkzeug version for open3d 0.16.0, otherwise error: `ImportError: cannot import name 'url_quote' from 'werkzeug.urls'`. But need update to solve the problem: `pip install --upgrade Flask` [ref](https://stackoverflow.com/questions/77213053/why-did-flask-start-failing-with-importerror-cannot-import-name-url-quote-fr)
-
-
-2. `ImportError: libtorch_cuda.so: undefined symbol: cudaGraphInstantiateWithFlags, version libcudart.so.11.0`
- The cuda version: `pytorch::pytorch-cuda` and `nvidia::cudatoolkit` need be same. [Reference link](https://github.com/pytorch/pytorch/issues/90673#issuecomment-1563799299)
-
-
-3. In cluster have error: `pandas ImportError: /lib64/libstdc++.so.6: version 'GLIBCXX_3.4.29' not found`
- Solved by `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/proj/berzelius-2023-154/users/x_qinzh/mambaforge/lib`
-
-
-## Contribute
-
-If you want to contribute to new model, here are tips you can follow:
-1. Dataloader: we believe all data could be process to `.h5`, we named as different scene and inside a scene, the key of each data is timestamp. Check [dataprocess/README.md](../dataprocess/README.md#process) for more details.
-2. Model: All model files can be found [here: src/models](../src/models). You can view deflow and fastflow3d to know how to implement a new model. Don't forget to add to the `__init__.py` [file to import class](../src/models/__init__.py).
-3. Loss: All loss files can be found [here: src/lossfuncs.py](../src/lossfuncs.py). There are three loss functions already inside the file, you can add a new one following the same pattern.
-4. Training: Once you have implemented the model, you can add the model to the config file [here: conf/model](../conf/model) and train the model using the command `python train.py model=your_model_name`. One more note here may: if your res_dict from model output is different, you may need add one pattern in `def training_step` and `def validation_step`.
-
-All others like eval and vis will be changed according to the model you implemented as you follow the above steps.
\ No newline at end of file
diff --git a/assets/cuda/README.md b/assets/cuda/README.md
deleted file mode 100644
index 621bcd3..0000000
--- a/assets/cuda/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-My CUDA library
----
-
-Faster our code in CUDA.
-
-- chamfer3D: 3D chamfer distance within two point cloud, by Qingwen Zhang involved when she was working on SeFlow.
-- mmcv: directly from mmcv, not our code.
-
----
-
-Quick View about CUDA speed on our chamfer3D (Faster 60x than others):
-
-The number of points: (pc0: 88132, pc1: 88101)
-
-| Function | Time (ms) |
-| :---: | :---: |
-| Faiss | 817.698 |
-| CUDA([SCOOP](https://github.com/itailang/SCOOP/tree/master/auxiliary/ChamferDistancePytorch), Batch) | 83.275 |
-| Pytorch3D | 68.256 |
-| CUDA([SeFlow](https://github.com/KTH-RPL/SeFlow), SharedM) | **14.308** |
-| ~~mmcv~~(chamfer2D) | 651.510 |
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/README.md b/assets/cuda/chamfer3D/README.md
deleted file mode 100644
index abf8274..0000000
--- a/assets/cuda/chamfer3D/README.md
+++ /dev/null
@@ -1,155 +0,0 @@
-CUDA with Torch 初步尝试
----
-
-主要参考都是mmcv里的库 and [SCOOP](https://github.com/itailang/SCOOP/blob/master/auxiliary/ChamferDistancePytorch/chamfer3D/),只是想着仅提取这一个功能试一下,然后发现几个有意思的点
-CUDA with Torch C++ Programming, [torch official ref link](https://pytorch.org/tutorials/advanced/cpp_extension.html)
-
-1. `.cu` 不能被`.cpp` include,否则失去cu属性
-2. `.cpp`必须命名为`xxx_cuda.cpp` 否则torch CUDAextension不会找
-3. `.cu` 的include和平常的cuda编程有所不同:必须先include ATen然后再正常的导入 CUDA等库 [ref link](https://blog.csdn.net/weixin_39849839/article/details/125980694)
- ```cpp
- #include
-
- #include
- #include
- ```
-
-注意要自己多写一个lib_xxx.py 内含class以方便调用,但是class内的forward函数必须有 `ctx` 参数,否则会报错
-
-
-## Install
-```bash
-# change it if you use different cuda version
-export PATH=/usr/local/cuda-11.3/bin:$PATH
-export LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64:$LD_LIBRARY_PATH
-
-cd assets/cuda/chamfer3Dlib
-python ./setup.py install
-
-# then you will see
-Installed /home/kin/mambaforge/envs/seflow/lib/python3.8/site-packages/chamfer3D-1.0.0-py3.8-linux-x86_64.egg
-Processing dependencies for chamfer3D==1.0.0
-Finished processing dependencies for chamfer3D==1.0.0
-
-# then run with lib_voxelize.py to see if it works
-python ../chamfer_cuda.py
-```
-
-## ChamferDis Speed
-
-The number of points: (pc0: 88132, pc1: 88101)
-
-| Function | Time (ms) |
-| :---: | :---: |
-| Faiss | 817.698 |
-| CUDA([SCOOP](https://github.com/itailang/SCOOP/tree/master/auxiliary/ChamferDistancePytorch), Batch) | 83.275 |
-| Pytorch3D | 68.256 |
-| CUDA([SeFlow](https://github.com/KTH-RPL/SeFlow), SharedM) | **14.308** |
-| ~~mmcv~~(chamfer2D) | 651.510 |
-
-对比命令行:
-
-```bash
-cd assets/tests
-python chamferdis_speed_test.py
-```
-
-
-Test computer and System:
-- Desktop setting: i9-12900KF, GPU 3090, CUDA 11.3
-- System setting: Ubuntu 20.04, Python 3.8
-
-Output Example:
-```
-Output in my desktop with a 3090 GPU:
------- START Faiss Chamfer Distance Cal ------
-loss: tensor(0.1710, device='cuda:0')
-Faiss Chamfer Distance Cal time: 809.593 ms
-
------- START Pytorch3d Chamfer Distance Cal ------
-Pytorch3d Chamfer Distance Cal time: 68.906 ms
-loss: tensor(0.1710, device='cuda:0', grad_fn=)
-
------- START CUDA Chamfer Distance Cal ------
-Chamfer Distance Cal time: 1.814 ms
-loss: tensor(0.1710, device='cuda:0', grad_fn=)
-```
-
-## Mics
-
-
-### Note for CUDA ChamferDis
-
-主要是 两个月前写的 已经看不懂了;然后问题原因是因为 总是缺0.0003的精度(精度强迫症患者)
-然后就以为是自己写错了 后面发现是因为block的这种并行化 线程大小的不同对CUDA的浮点运算会有所不同,所以导致精度差距是有一点的 如果介意的话 可以使用pytorch3d的版本(也就是速度慢4倍左右 从15ms 到 80ms)
-
-这里主要重申一遍 shared memory在这里的用法:
-1. 首先我们每个点都会分开走到 `int tid = blockIdx.x * blockDim.x + threadIdx.x;` 也就是全局索引,注意这个每个点都分开了 因为pc0每个点和pc1的临近点 和 其他的pc0点无关
-2. 然后走到每个点内部 就是__shared__ 我们首先建立了 pc1的share,但是因为共享内存有限,所以每次只保存THREADS_PER_BLOCK
-3. 保存 THREADS_PER_BLOCK 也是每个线程做的 我们在对比距离前 运行了 __syncthreads(); 确保 THREADS_PER_BLOCK 个点的 pc1 已经到了
-4. 接着 我们在 `num_elems` 这一部分的数据内进行对比,同步best
-5. 最后传给 全局这个点的 `result`
-
-需要注意的是 这种极致的并行化 会对精度产生一定的影响,但是如果你感兴趣 `#define THREADS_PER_BLOCK 256` 可以调整这个,对每个block设置不同的threads 会对精度有影响(当然 影响是 在 gt: 0.1710 但cuda计算会是 0.1711 - 0.1713之间)
-
-以下为chatgpt:
-精度差异的原因之一可能是由于在不同的线程块大小下,浮点运算的顺序发生了改变。由于浮点运算是不结合的(即(a + b) + c 可能不等于 a + (b + c)),因此改变运算的顺序可能会导致轻微的结果差异。
-
-这种类型的精度变化在GPU计算中是非常常见的,特别是在使用较大的数据集和进行大量的浮点运算时。要完全消除这种差异是非常困难的,因为即使是非常微小的实现细节变化(例如改变线程块大小、更改循环的结构、甚至是不同的GPU硬件或不同的CUDA版本)都可能导致浮点运算顺序的微小变化。
-
-如果需要确保结果的一致性,可以考虑以下方法:
-
-1. 固定线程块大小:选择一个固定的线程块大小,并始终使用它。
-
-2. 双精度浮点数(Double Precision):使用double类型代替float,可以提高精度,但代价是更高的内存使用和可能的性能下降。
-
-3. 数值稳定的算法:尽量使用数值稳定的算法,尽管这在GPU上实现起来可能比较复杂且效率较低。
-
-4. 减少并行化程度:通过减少并行化程度来减少由于不同线程执行顺序引起的差异,但这通常会牺牲性能。
-
-
-复制代码部分如下:
-```cpp
-
-for (int i = 0; i < pc1_n; i += THREADS_PER_BLOCK) {
- // Copy a block of pc1 to shared memory
- int pc1_idx = i + threadIdx.x;
- if (pc1_idx < pc1_n) {
- shared_pc1[threadIdx.x * 3 + 0] = pc1_xyz[pc1_idx * 3 + 0];
- shared_pc1[threadIdx.x * 3 + 1] = pc1_xyz[pc1_idx * 3 + 1];
- shared_pc1[threadIdx.x * 3 + 2] = pc1_xyz[pc1_idx * 3 + 2];
- }
-
- __syncthreads();
-
- // Compute the distance between pc0[tid] and the points in shared_pc1
- // NOTE(Qingwen): since after two months I forgot what I did here, I write some notes for future me
- // 0. One reason for the difference in precision may be due to the changing order of floating point operations at different thread block sizes.
- // But I think it's fine we lose 0.0001 precision for speed up cal time 4x
- // 1. since we use shared to store pc1, here Every BLOCK will have new shared_pc1 start from 0
- // 2. we use THREADS_PER_BLOCK to loop pc1, so we need to check if the last block is not full
- // 3. Based on the CUDA document, the __syncthreads() is not necessary here, but we keep it for safety
- // 4. After running once, we go for next block of pc1, and find the best in that batch
-
- int num_elems = min(THREADS_PER_BLOCK, pc1_n - i);
- for (int j = 0; j < num_elems; j++) {
- float x1 = shared_pc1[j * 3 + 0];
- float y1 = shared_pc1[j * 3 + 1];
- float z1 = shared_pc1[j * 3 + 2];
- float d = (x1 - x0) * (x1 - x0) + (y1 - y0) * (y1 - y0) + (z1 - z0) * (z1 - z0);
- if (d < best) {
- best = d;
- best_i = j + i;
- }
- }
- __syncthreads();
-}
-```
-
-## Other issues
-In cluster when build cuda things, you may occur problem:
-- `gcc: error trying to exec 'cc1plus': execvp: No such file or directory`,
- Main reason is gcc and g++ version problem in cluster, you can try to install inside `conda` to solve that, with: `mamba install -c conda-forge gxx==9.5.0`. And the reason why I set 9.5.0 is because of the version for cuda 11.3 need inside specific version.
- ```bash
- RuntimeError: The current installed version of g++ (13.2.0) is greater than the maximum required version by CUDA 11.3. Please make sure to use an adequate version of g++ (>=5.0.0, <11.0).
- ```
diff --git a/assets/cuda/chamfer3D/__init__.py b/assets/cuda/chamfer3D/__init__.py
deleted file mode 100644
index 3aac3a5..0000000
--- a/assets/cuda/chamfer3D/__init__.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""
-# Created: 2023-08-04 11:20
-# Copyright (C) 2023-now, RPL, KTH Royal Institute of Technology
-# Author: Qingwen Zhang (https://kin-zhang.github.io/)
-#
-# This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
-# If you find this repo helpful, please cite the respective publication as
-# listed on the above website.
-#
-#
-# Description: ChamferDis speedup using CUDA
-"""
-from torch import nn
-from torch.autograd import Function
-import torch
-
-import os, time
-import chamfer3D
-BASE_DIR = os.path.abspath(os.path.join( os.path.dirname( __file__ ), '../..' ))
-
-
-# GPU tensors only
-class ChamferDis(Function):
- @staticmethod
- def forward(ctx, pc0, pc1):
- # pc0: (N,3), pc1: (M,3)
- dis0 = torch.zeros(pc0.shape[0]).to(pc0.device).contiguous()
- dis1 = torch.zeros(pc1.shape[0]).to(pc1.device).contiguous()
-
- idx0 = torch.zeros(pc0.shape[0], dtype=torch.int32).to(pc0.device).contiguous()
- idx1 = torch.zeros(pc1.shape[0], dtype=torch.int32).to(pc1.device).contiguous()
-
-
- chamfer3D.forward(pc0, pc1, dis0, dis1, idx0, idx1)
- ctx.save_for_backward(pc0, pc1, idx0, idx1)
- return dis0, dis1, idx0, idx1
-
- @staticmethod
- def backward(ctx, grad_dist0, grad_dist1, grad_idx0, grad_idx1):
- pc0, pc1, idx0, idx1 = ctx.saved_tensors
- grad_dist0 = grad_dist0.contiguous()
- grad_dist1 = grad_dist1.contiguous()
- device = grad_dist1.device
-
- grad_pc0 = torch.zeros(pc0.size()).to(device).contiguous()
- grad_pc1 = torch.zeros(pc1.size()).to(device).contiguous()
-
- chamfer3D.backward(
- pc0, pc1, idx0, idx1, grad_dist0, grad_dist1, grad_pc0, grad_pc1
- )
- return grad_pc0, grad_pc1
-
-class nnChamferDis(nn.Module):
- def __init__(self, truncate_dist=True):
- super(nnChamferDis, self).__init__()
- self.truncate_dist = truncate_dist
-
- def forward(self, input0, input1, truncate_dist=-1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, _, _ = ChamferDis.apply(input0, input1)
-
- if truncate_dist<=0:
- return torch.mean(dist0) + torch.mean(dist1)
-
- valid_mask0 = (dist0 <= truncate_dist)
- valid_mask1 = (dist1 <= truncate_dist)
- truncated_sum = torch.nanmean(dist0[valid_mask0]) + torch.nanmean(dist1[valid_mask1])
- return truncated_sum
-
- def dis_res(self, input0, input1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, _, _ = ChamferDis.apply(input0, input1)
- return dist0, dist1
-
- def truncated_dis(self, input0, input1):
- # nsfp: truncated distance way is set >= 2 to 0 but not nanmean
- cham_x, cham_y = self.dis_res(input0, input1)
- cham_x[cham_x >= 2] = 0.0
- cham_y[cham_y >= 2] = 0.0
- return torch.mean(cham_x) + torch.mean(cham_y)
-
- def disid_res(self, input0, input1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, idx0, idx1 = ChamferDis.apply(input0, input1)
- return dist0, dist1, idx0, idx1
-class NearestNeighborDis(nn.Module):
- def __init__(self):
- super(NearestNeighborDis, self).__init__()
-
- def forward(self, input0, input1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, _, _ = ChamferDis.apply(input0, input1)
-
- return torch.mean(dist0[dist0 <= 2])
-
-if __name__ == "__main__":
- import numpy as np
- pc0 = np.load(f'{BASE_DIR}/assets/tests/test_pc0.npy')
- pc1 = np.load(f'{BASE_DIR}/assets/tests/test_pc1.npy')
- print('0: {:.3f}MB'.format(torch.cuda.memory_allocated()/1024**2))
- pc0 = torch.from_numpy(pc0[...,:3]).float().cuda().contiguous()
- pc1 = torch.from_numpy(pc1[...,:3]).float().cuda().contiguous()
- pc0.requires_grad = True
- pc1.requires_grad = True
- print(pc0.shape, "demo data: ", pc0[0])
- print(pc1.shape, "demo data: ", pc1[0])
- print('1: {:.3f}MB'.format(torch.cuda.memory_allocated()/1024**2))
-
- start_time = time.time()
- loss = nnChamferDis(truncate_dist=False)(pc0, pc1)
- loss.backward()
- print("loss: ", loss)
- print(f"Chamfer Distance Cal time: {(time.time() - start_time)*1000:.3f} ms")
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/chamfer3D.cu b/assets/cuda/chamfer3D/chamfer3D.cu
deleted file mode 100644
index 72af47d..0000000
--- a/assets/cuda/chamfer3D/chamfer3D.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (C) 2022-now, RPL, KTH Royal Institute of Technology
- * @Author: Qingwen Zhang (https://kin-zhang.github.io/)
- * @Date: 2023-08-03 16:55
- * @Description: Chamfer distance calculation between two point clouds with CUDA
- * This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
- * If you find this repo helpful, please cite the respective publication as
- * listed on the above website.
-
-
- * Reference: Modified from SCOOP chamfer3D [https://github.com/itailang/SCOOP]
- * faster 2x than the original version
-*/
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-
-#define CUDA_1D_KERNEL_LOOP(i, n) \
- for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x)
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-#define THREADS_PER_BLOCK 256
-
-
-__global__ void NmDistanceKernel(const int pc0_n, const float *pc0_xyz, const int pc1_n, const float *pc1_xyz, float *result, int *result_i){
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (tid >= pc0_n) return;
-
- float x0 = pc0_xyz[tid * 3 + 0];
- float y0 = pc0_xyz[tid * 3 + 1];
- float z0 = pc0_xyz[tid * 3 + 2];
-
- __shared__ float shared_pc1[THREADS_PER_BLOCK * 3];
-
- int best_i = -1;
- float best = 1e20;
-
- for (int i = 0; i < pc1_n; i += THREADS_PER_BLOCK) {
- // Copy a block of pc1 to shared memory
- int pc1_idx = i + threadIdx.x;
- if (pc1_idx < pc1_n) {
- shared_pc1[threadIdx.x * 3 + 0] = pc1_xyz[pc1_idx * 3 + 0];
- shared_pc1[threadIdx.x * 3 + 1] = pc1_xyz[pc1_idx * 3 + 1];
- shared_pc1[threadIdx.x * 3 + 2] = pc1_xyz[pc1_idx * 3 + 2];
- }
-
- __syncthreads();
-
- // Compute the distance between pc0[tid] and the points in shared_pc1
- int num_elems = min(THREADS_PER_BLOCK, pc1_n - i);
- for (int j = 0; j < num_elems; j++) {
- float x1 = shared_pc1[j * 3 + 0];
- float y1 = shared_pc1[j * 3 + 1];
- float z1 = shared_pc1[j * 3 + 2];
- float d = (x1 - x0) * (x1 - x0) + (y1 - y0) * (y1 - y0) + (z1 - z0) * (z1 - z0);
- if (d < best) {
- best = d;
- best_i = j + i;
- }
- }
-
- __syncthreads();
- }
-
- // done with this thread in tid in pc_0, save the result to global memory
- atomicExch(&result[tid], best);
- atomicExch(&result_i[tid], best_i);
-}
-
-int chamfer_cuda_forward(const at::Tensor &pc0, const at::Tensor &pc1, at::Tensor &dist0, at::Tensor &dist1, at::Tensor &idx0, at::Tensor &idx1)
-{
- at::cuda::CUDAGuard device_guard(pc0.device());
- cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- const int pc0_n = pc0.size(0);
- const int pc1_n = pc1.size(0);
-
- const int col_blocks_pc0 = DIVUP(pc0_n, THREADS_PER_BLOCK);
- dim3 blocks_pc0(col_blocks_pc0);
- const int col_blocks_pc1 = DIVUP(pc1_n, THREADS_PER_BLOCK);
- dim3 blocks_pc1(col_blocks_pc1);
- dim3 threads(THREADS_PER_BLOCK);
-
- NmDistanceKernel<<>>(pc0_n, pc0.data_ptr(), pc1_n, pc1.data_ptr(), dist0.data_ptr(), idx0.data_ptr());
- NmDistanceKernel<<>>(pc1_n, pc1.data_ptr(), pc0_n, pc0.data_ptr(), dist1.data_ptr(), idx1.data_ptr());
-
- AT_CUDA_CHECK(cudaGetLastError());
-
- return 1;
-}
-
-__global__ void NmDistanceGradKernel(const int pc0_n, const float *pc0_xyz, const int pc1_n, const float *pc1_xyz,
- const float *grad_dist0, const int *idx0, float *grad_pc0, float *grad_pc1)
-{
- CUDA_1D_KERNEL_LOOP(j0, pc0_n){
- float x0 = pc0_xyz[j0 * 3 + 0];
- float y0 = pc0_xyz[j0 * 3 + 1];
- float z0 = pc0_xyz[j0 * 3 + 2];
-
- int j1 = idx0[j0];
- float x1 = pc1_xyz[j1 * 3 + 0];
- float y1 = pc1_xyz[j1 * 3 + 1];
- float z1 = pc1_xyz[j1 * 3 + 2];
-
- float g = grad_dist0[j0] * 2;
-
- atomicAdd(&grad_pc0[j0 * 3 + 0], g * (x0 - x1));
- atomicAdd(&grad_pc0[j0 * 3 + 1], g * (y0 - y1));
- atomicAdd(&grad_pc0[j0 * 3 + 2], g * (z0 - z1));
-
- atomicAdd(&grad_pc1[j1 * 3 + 0], - (g * (x0 - x1)));
- atomicAdd(&grad_pc1[j1 * 3 + 1], - (g * (y0 - y1)));
- atomicAdd(&grad_pc1[j1 * 3 + 2], - (g * (z0 - z1)));
- }
-}
-int chamfer_cuda_backward(const at::Tensor &pc0, const at::Tensor &pc1,
- const at::Tensor &idx0, const at::Tensor &idx1,
- at::Tensor &grad_dist0, at::Tensor &grad_dist1,
- at::Tensor &grad_pc0, at::Tensor &grad_pc1)
-{
- at::cuda::CUDAGuard device_guard(pc0.device());
- cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- const int pc0_n = pc0.size(0);
- const int pc1_n = pc1.size(0);
-
- const int col_blocks_pc0 = DIVUP(pc0_n, THREADS_PER_BLOCK);
- dim3 blocks_pc0(col_blocks_pc0);
- const int col_blocks_pc1 = DIVUP(pc1_n, THREADS_PER_BLOCK);
- dim3 blocks_pc1(col_blocks_pc1);
- dim3 threads(THREADS_PER_BLOCK);
-
- NmDistanceGradKernel<<>>(pc0_n, pc0.data_ptr(), pc1_n, pc1.data_ptr(), grad_dist0.data_ptr(), idx0.data_ptr(), grad_pc0.data_ptr(), grad_pc1.data_ptr());
- NmDistanceGradKernel<<>>(pc1_n, pc1.data_ptr(), pc0_n, pc0.data_ptr(), grad_dist1.data_ptr(), idx1.data_ptr(), grad_pc1.data_ptr(), grad_pc0.data_ptr());
-
- AT_CUDA_CHECK(cudaGetLastError());
-
- return 1;
-}
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/chamfer3D_cuda.cpp b/assets/cuda/chamfer3D/chamfer3D_cuda.cpp
deleted file mode 100644
index 7a89216..0000000
--- a/assets/cuda/chamfer3D/chamfer3D_cuda.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2022-now, RPL, KTH Royal Institute of Technology
- * @Author: Qingwen Zhang (https://kin-zhang.github.io/)
- * @Date: 2023-08-03 16:55
- * @Description: Chamfer distance calculation between two point clouds with CUDA
- * This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
- * If you find this repo helpful, please cite the respective publication as
- * listed on the above website.
-
-
- * Reference: Modified from SCOOP chamfer3D [https://github.com/itailang/SCOOP]
- * faster 2x than the original version
-*/
-
-#include
-#include
-
-int chamfer_cuda_forward(
- const at::Tensor &pc0,
- const at::Tensor &pc1,
- at::Tensor &dist0,
- at::Tensor &dist1,
- at::Tensor &idx0,
- at::Tensor &idx1);
-
-int chamfer_cuda_backward(
- const at::Tensor &pc0, const at::Tensor &pc1,
- const at::Tensor &idx0, const at::Tensor &idx1,
- at::Tensor &grad_dist0, at::Tensor &grad_dist1,
- at::Tensor &grad_pc0, at::Tensor &grad_pc1);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
- m.def("forward", &chamfer_cuda_forward, "Chamfer Distance (CUDA)");
- m.def("backward", &chamfer_cuda_backward, "Chamfer Distance (CUDA) Backward Grad");
-}
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/setup.py b/assets/cuda/chamfer3D/setup.py
deleted file mode 100755
index 2c16070..0000000
--- a/assets/cuda/chamfer3D/setup.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-setup(
- name='chamfer3D',
- ext_modules=[
- CUDAExtension('chamfer3D', [
- "/".join(__file__.split('/')[:-1] + ['chamfer3D_cuda.cpp']), # must named as xxx_cuda.cpp
- "/".join(__file__.split('/')[:-1] + ['chamfer3D.cu']),
- ]),
- ],
- cmdclass={
- 'build_ext': BuildExtension
- },
- version='1.0.1')
diff --git a/assets/cuda/mmcv/README.md b/assets/cuda/mmcv/README.md
deleted file mode 100644
index 5fa6799..0000000
--- a/assets/cuda/mmcv/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-mmcv
----
-
-This file extracted functions we need used in mmcv to release the dependency of mmcv-full and faster the installation process.
\ No newline at end of file
diff --git a/assets/cuda/mmcv/__init__.py b/assets/cuda/mmcv/__init__.py
deleted file mode 100755
index dca7091..0000000
--- a/assets/cuda/mmcv/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .voxelize import Voxelization, voxelization
-from .scatter_points import DynamicScatter, dynamic_scatter
-
-__all__ = [
- 'Voxelization', 'voxelization',
- 'dynamic_scatter', 'DynamicScatter'
-]
diff --git a/assets/cuda/mmcv/common_cuda_helper.hpp b/assets/cuda/mmcv/common_cuda_helper.hpp
deleted file mode 100644
index b12aa9a..0000000
--- a/assets/cuda/mmcv/common_cuda_helper.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef COMMON_CUDA_HELPER
-#define COMMON_CUDA_HELPER
-
-#include
-
-#define CUDA_1D_KERNEL_LOOP(i, n) \
- for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x)
-
-#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
- for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x) \
- for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
- j += blockDim.y * gridDim.y)
-
-#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \
- for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
- for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
-
-#define THREADS_PER_BLOCK 512
-
-inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
- int optimal_block_num = (N + num_threads - 1) / num_threads;
- int max_block_num = 4096;
- return min(optimal_block_num, max_block_num);
-}
-
-template
-__device__ T bilinear_interpolate(const T* input, const int height,
- const int width, T y, T x,
- const int index /* index for debug only*/) {
- // deal with cases that inverse elements are out of feature map boundary
- if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
-
- if (y <= 0) y = 0;
- if (x <= 0) x = 0;
-
- int y_low = (int)y;
- int x_low = (int)x;
- int y_high;
- int x_high;
-
- if (y_low >= height - 1) {
- y_high = y_low = height - 1;
- y = (T)y_low;
- } else {
- y_high = y_low + 1;
- }
-
- if (x_low >= width - 1) {
- x_high = x_low = width - 1;
- x = (T)x_low;
- } else {
- x_high = x_low + 1;
- }
-
- T ly = y - y_low;
- T lx = x - x_low;
- T hy = 1. - ly, hx = 1. - lx;
- // do bilinear interpolation
- T v1 = input[y_low * width + x_low];
- T v2 = input[y_low * width + x_high];
- T v3 = input[y_high * width + x_low];
- T v4 = input[y_high * width + x_high];
- T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
- T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
- return val;
-}
-
-template
-__device__ void bilinear_interpolate_gradient(
- const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
- int& x_low, int& x_high, int& y_low, int& y_high,
- const int index /* index for debug only*/) {
- // deal with cases that inverse elements are out of feature map boundary
- if (y < -1.0 || y > height || x < -1.0 || x > width) {
- // empty
- w1 = w2 = w3 = w4 = 0.;
- x_low = x_high = y_low = y_high = -1;
- return;
- }
-
- if (y <= 0) y = 0;
- if (x <= 0) x = 0;
-
- y_low = (int)y;
- x_low = (int)x;
-
- if (y_low >= height - 1) {
- y_high = y_low = height - 1;
- y = (T)y_low;
- } else {
- y_high = y_low + 1;
- }
-
- if (x_low >= width - 1) {
- x_high = x_low = width - 1;
- x = (T)x_low;
- } else {
- x_high = x_low + 1;
- }
-
- T ly = y - y_low;
- T lx = x - x_low;
- T hy = 1. - ly, hx = 1. - lx;
-
- // reference in forward
- // T v1 = input[y_low * width + x_low];
- // T v2 = input[y_low * width + x_high];
- // T v3 = input[y_high * width + x_low];
- // T v4 = input[y_high * width + x_high];
- // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
- w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
- return;
-}
-#endif // COMMON_CUDA_HELPER
diff --git a/assets/cuda/mmcv/cudabind.cpp b/assets/cuda/mmcv/cudabind.cpp
deleted file mode 100644
index 6dd27f1..0000000
--- a/assets/cuda/mmcv/cudabind.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-#include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"
-
-typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
-
-void DynamicVoxelizeForwardCUDAKernelLauncher(
- const at::Tensor &points, at::Tensor &coors,
- const std::vector voxel_size, const std::vector coors_range,
- const int NDim = 3);
-
-void dynamic_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &coors,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int NDim) {
- DynamicVoxelizeForwardCUDAKernelLauncher(points, coors, voxel_size,
- coors_range, NDim);
-};
-
-void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int NDim);
-
-
-int HardVoxelizeForwardCUDAKernelLauncher(
- const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
- at::Tensor &num_points_per_voxel, const std::vector voxel_size,
- const std::vector coors_range, const int max_points,
- const int max_voxels, const int NDim = 3);
-
-int hard_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &voxels,
- at::Tensor &coors,
- at::Tensor &num_points_per_voxel,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int max_points, const int max_voxels,
- const int NDim) {
- return HardVoxelizeForwardCUDAKernelLauncher(
- points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
- max_points, max_voxels, NDim);
-};
-
-int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
- at::Tensor &coors,
- at::Tensor &num_points_per_voxel,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int max_points, const int max_voxels,
- const int NDim);
-
-int nondeterministic_hard_voxelize_forward_impl(
- const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
- at::Tensor &num_points_per_voxel, const std::vector voxel_size,
- const std::vector coors_range, const int max_points,
- const int max_voxels, const int NDim);
-
-REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,
- hard_voxelize_forward_cuda);
-REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,
- dynamic_voxelize_forward_cuda);
-
-
-std::vector DynamicPointToVoxelForwardCUDAKernelLauncher(
- const at::Tensor &feats, const at::Tensor &coors,
- const reduce_t reduce_type);
-
-
-std::vector dynamic_point_to_voxel_forward_cuda(
- const torch::Tensor &feats, const torch::Tensor &coors,
- const reduce_t reduce_type) {
- return DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors,
- reduce_type);
-};
-
-void DynamicPointToVoxelBackwardCUDAKernelLauncher(
- at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
- const at::Tensor &feats, const at::Tensor &reduced_feats,
- const at::Tensor &coors_map, const at::Tensor &reduce_count,
- const reduce_t reduce_type);
-
-void dynamic_point_to_voxel_backward_cuda(
- torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
- const torch::Tensor &feats, const torch::Tensor &reduced_feats,
- const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
- const reduce_t reduce_type) {
- DynamicPointToVoxelBackwardCUDAKernelLauncher(grad_feats, grad_reduced_feats,
- feats, reduced_feats, coors_idx,
- reduce_count, reduce_type);
-};
-
-std::vector dynamic_point_to_voxel_forward_impl(
- const torch::Tensor &feats, const torch::Tensor &coors,
- const reduce_t reduce_type);
-
-void dynamic_point_to_voxel_backward_impl(
- torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
- const torch::Tensor &feats, const torch::Tensor &reduced_feats,
- const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
- const reduce_t reduce_type);
-
-REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, CUDA,
- dynamic_point_to_voxel_forward_cuda);
-REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, CUDA,
- dynamic_point_to_voxel_backward_cuda);
\ No newline at end of file
diff --git a/assets/cuda/mmcv/pybind.cpp b/assets/cuda/mmcv/pybind.cpp
deleted file mode 100644
index 1b0823b..0000000
--- a/assets/cuda/mmcv/pybind.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved
-#include
-
-#include "pytorch_cpp_helper.hpp"
-
-
-std::vector dynamic_point_to_voxel_forward(
- const torch::Tensor &feats, const torch::Tensor &coors,
- const std::string &reduce_type);
-
-void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
- const torch::Tensor &grad_reduced_feats,
- const torch::Tensor &feats,
- const torch::Tensor &reduced_feats,
- const torch::Tensor &coors_idx,
- const torch::Tensor &reduce_count,
- const std::string &reduce_type);
-
-void dynamic_voxelize_forward(const at::Tensor &points,
- const at::Tensor &voxel_size,
- const at::Tensor &coors_range, at::Tensor &coors,
- const int NDim);
-
-void hard_voxelize_forward(const at::Tensor &points,
- const at::Tensor &voxel_size,
- const at::Tensor &coors_range, at::Tensor &voxels,
- at::Tensor &coors, at::Tensor &num_points_per_voxel,
- at::Tensor &voxel_num, const int max_points,
- const int max_voxels, const int NDim,
- const bool deterministic);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
- m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward,
- "dynamic_point_to_voxel_forward", py::arg("feats"), py::arg("coors"),
- py::arg("reduce_type"));
- m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward,
- "dynamic_point_to_voxel_backward", py::arg("grad_feats"),
- py::arg("grad_reduced_feats"), py::arg("feats"),
- py::arg("reduced_feats"), py::arg("coors_idx"), py::arg("reduce_count"),
- py::arg("reduce_type"));
- m.def("hard_voxelize_forward", &hard_voxelize_forward,
- "hard_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
- py::arg("coors_range"), py::arg("voxels"), py::arg("coors"),
- py::arg("num_points_per_voxel"), py::arg("voxel_num"),
- py::arg("max_points"), py::arg("max_voxels"), py::arg("NDim"),
- py::arg("deterministic"));
- m.def("dynamic_voxelize_forward", &dynamic_voxelize_forward,
- "dynamic_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
- py::arg("coors_range"), py::arg("coors"), py::arg("NDim"));
-}
\ No newline at end of file
diff --git a/assets/cuda/mmcv/pytorch_cpp_helper.hpp b/assets/cuda/mmcv/pytorch_cpp_helper.hpp
deleted file mode 100644
index f68e874..0000000
--- a/assets/cuda/mmcv/pytorch_cpp_helper.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef PYTORCH_CPP_HELPER
-#define PYTORCH_CPP_HELPER
-#include
-
-#include
-
-using namespace at;
-
-#define CHECK_CUDA(x) \
- TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_MLU(x) \
- TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor")
-#define CHECK_CPU(x) \
- TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
-#define CHECK_CONTIGUOUS(x) \
- TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_CUDA_INPUT(x) \
- CHECK_CUDA(x); \
- CHECK_CONTIGUOUS(x)
-#define CHECK_MLU_INPUT(x) \
- CHECK_MLU(x); \
- CHECK_CONTIGUOUS(x)
-#define CHECK_CPU_INPUT(x) \
- CHECK_CPU(x); \
- CHECK_CONTIGUOUS(x)
-
-#endif // PYTORCH_CPP_HELPER
diff --git a/assets/cuda/mmcv/pytorch_cuda_helper.hpp b/assets/cuda/mmcv/pytorch_cuda_helper.hpp
deleted file mode 100644
index 52e5126..0000000
--- a/assets/cuda/mmcv/pytorch_cuda_helper.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef PYTORCH_CUDA_HELPER
-#define PYTORCH_CUDA_HELPER
-
-#include
-#include
-#include
-
-#include
-#include
-
-#include "common_cuda_helper.hpp"
-
-using at::Half;
-using at::Tensor;
-using phalf = at::Half;
-
-#define __PHALF(x) (x)
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-
-#endif // PYTORCH_CUDA_HELPER
diff --git a/assets/cuda/mmcv/pytorch_device_registry.hpp b/assets/cuda/mmcv/pytorch_device_registry.hpp
deleted file mode 100644
index 2a32b72..0000000
--- a/assets/cuda/mmcv/pytorch_device_registry.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef PYTORCH_DEVICE_REGISTRY_H
-#define PYTORCH_DEVICE_REGISTRY_H
-
-// Using is recommended in the official documentation in
-// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
-// However, we use for compatibility with CUDA 9.0
-// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
-#include
-
-#include
-#include
-#include