diff --git a/.github/issue_stale.yaml b/.github/issue_stale.yaml
deleted file mode 100644
index 17948d3..0000000
--- a/.github/issue_stale.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# Number of days of inactivity before an issue becomes stale
-daysUntilStale: 7
-# Number of days of inactivity before a stale issue is closed
-daysUntilClose: 7
-# Issues with these labels will never be considered stale
-exemptLabels:
- - backlog
-# Label to use when marking an issue as stale
-staleLabel: stale
-# Comment to post when marking an issue as stale. Set to `false` to disable
-markComment: >
- This issue has been automatically marked as stale because it has not had
- recent activity. It will be closed if no further activity occurs. Thank you
- for your contributions.
-# Comment to post when closing a stale issue. Set to `false` to disable
-closeComment: false
\ No newline at end of file
diff --git a/.github/workflows/issue_stale.yaml b/.github/workflows/issue_stale.yaml
new file mode 100644
index 0000000..7c11c2b
--- /dev/null
+++ b/.github/workflows/issue_stale.yaml
@@ -0,0 +1,30 @@
+name: Close inactive issues
+on:
+ schedule:
+ - cron: "35 11 * * 5"
+
+env:
+ DAYS_BEFORE_ISSUE_STALE: 30
+ DAYS_BEFORE_ISSUE_CLOSE: 14
+
+jobs:
+ close-issues:
+ runs-on: ubuntu-latest
+ permissions:
+ issues: write
+ pull-requests: write
+ steps:
+ - uses: actions/stale@v5
+ with:
+ days-before-issue-stale: ${{ env.DAYS_BEFORE_ISSUE_STALE }}
+ days-before-issue-close: ${{ env.DAYS_BEFORE_ISSUE_CLOSE }}
+ stale-issue-label: "stale"
+ stale-issue-message: |
+ This issue is stale because it has been open for ${{ env.DAYS_BEFORE_ISSUE_STALE }} days with no activity.
+ It will be closed if no further activity occurs. Let us know if you still need help!
+ close-issue-message: |
+ This issue is being closed because it has been stale for ${{ env.DAYS_BEFORE_ISSUE_CLOSE }} days with no activity.
+ If you still need help, please feel free to leave comments.
+ days-before-pr-stale: -1
+ days-before-pr-close: -1
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..0bd27a2
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "OpenSceneFlow"]
+ path = OpenSceneFlow
+ url = https://github.com/KTH-RPL/OpenSceneFlow.git
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index b80f234..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,43 +0,0 @@
-# check more: https://hub.docker.com/r/nvidia/cuda
-FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
-ENV DEBIAN_FRONTEND noninteractive
-
-RUN apt update && apt install -y --no-install-recommends \
- git curl vim rsync htop
-
-RUN curl -o ~/miniconda.sh -LO https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda clean -ya && /opt/conda/bin/conda init bash
-
-RUN curl -o ~/mamba.sh -LO https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
- chmod +x ~/mamba.sh && \
- ~/mamba.sh -b -p /opt/mambaforge && \
- rm ~/mamba.sh && /opt/mambaforge/bin/mamba init bash
-
-# install zsh and oh-my-zsh
-RUN apt install -y wget git zsh tmux vim g++
-RUN sh -c "$(wget -O- https://github.com/deluan/zsh-in-docker/releases/download/v1.1.5/zsh-in-docker.sh)" -- \
- -t robbyrussell -p git \
- -p https://github.com/agkozak/zsh-z \
- -p https://github.com/zsh-users/zsh-autosuggestions \
- -p https://github.com/zsh-users/zsh-completions \
- -p https://github.com/zsh-users/zsh-syntax-highlighting
-
-RUN printf "y\ny\ny\n\n" | bash -c "$(curl -fsSL https://raw.githubusercontent.com/Kin-Zhang/Kin-Zhang/main/scripts/setup_ohmyzsh.sh)"
-RUN /opt/conda/bin/conda init zsh && /opt/mambaforge/bin/mamba init zsh
-
-# change to conda env
-ENV PATH /opt/conda/bin:$PATH
-ENV PATH /opt/mambaforge/bin:$PATH
-
-RUN mkdir -p /home/kin/workspace && cd /home/kin/workspace && git clone https://github.com/KTH-RPL/SeFlow.git
-WORKDIR /home/kin/workspace/SeFlow
-RUN apt-get update && apt-get install libgl1 -y
-# need read the gpu device info to compile the cuda extension
-RUN cd /home/kin/workspace/SeFlow && /opt/mambaforge/bin/mamba env create -f environment.yaml
-RUN cd /home/kin/workspace/SeFlow/assets/cuda/mmcv && /opt/mambaforge/envs/seflow/bin/python ./setup.py install
-RUN cd /home/kin/workspace/SeFlow/assets/cuda/chamfer3D && /opt/mambaforge/envs/seflow/bin/python ./setup.py install
-
-
diff --git a/OpenSceneFlow b/OpenSceneFlow
new file mode 160000
index 0000000..6226b07
--- /dev/null
+++ b/OpenSceneFlow
@@ -0,0 +1 @@
+Subproject commit 6226b07784dbd6f1aa52931d419a7d3707b4e3d3
diff --git a/README.md b/README.md
index d2fa38d..49fd33a 100644
--- a/README.md
+++ b/README.md
@@ -8,70 +8,36 @@ SeFlow: A Self-Supervised Scene Flow Method in Autonomous Driving

-2024/11/18 16:17: Update model and demo data download link through HuggingFace, Personally I found `wget` from HuggingFace link is much faster than Zenodo.
-
-2024/09/26 16:24: All codes already uploaded and tested. You can to try training directly by downloading (through [HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow)/[Zenodo](https://zenodo.org/records/13744999)) demo data or pretrained weight for evaluation.
-
-Pre-trained weights for models are available in [Zenodo](https://zenodo.org/records/13744999)/[HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow) link. Check usage in [2. Evaluation](#2-evaluation) or [3. Visualization](#3-visualization).
-
Task: __Self-Supervised__ Scene Flow Estimation in Autonomous Driving. No human-label needed. Real-time inference (15-20Hz in RTX3090).
-We directly follow our previous work [code structure](https://github.com/KTH-RPL/DeFlow), so you may want to start from the easier one with supervised learning first: Try [DeFlow](https://github.com/KTH-RPL/DeFlow). Then you will find this is simple to you (things about how to train under self-supervised). Here are **Scripts** quick view in this repo:
-
-- `dataprocess/extract_*.py` : pre-process data before training to speed up the whole training time.
- [Dataset we included now: Argoverse 2 and Waymo. more on the way: Nuscenes, custom data.]
-
-- `process.py`: process data with save dufomap, cluster labels inside file. Only needed once for training.
-
-- `train.py`: Train the model and get model checkpoints. Pls remember to check the config.
-
-- `eval.py` : Evaluate the model on the validation/test set. And also output the zip file to upload to online leaderboard.
-
-- `save.py` : Will save result into h5py file, using [tool/visualization.py] to show results with interactive window.
-
- 🎁 One repository, All methods!
-
-You can try following methods in our code without any effort to make your own benchmark.
+📜 2025/02/18: Merging all scene flow code to [OpenSceneFlow codebase](https://github.com/KTH-RPL/OpenSceneFlow) for afterward code maintenance. This repo saved README, [cluster slurm files](assets/slurm), and [quick core file](lossfunc.py) in SeFlow. The old source code branch is also [available here](https://github.com/KTH-RPL/SeFlow/tree/source).
-- [x] [SeFlow](https://arxiv.org/abs/2407.01702) (Ours 🚀): ECCV 2024
-- [x] [DeFlow](https://arxiv.org/abs/2401.16122) (Ours 🚀): ICRA 2024
-- [x] [FastFlow3d](https://arxiv.org/abs/2103.01306): RA-L 2021
-- [x] [ZeroFlow](https://arxiv.org/abs/2305.10424): ICLR 2024, their pre-trained weight can covert into our format easily through [the script](tools/zerof2ours.py).
-- [ ] [NSFP](https://arxiv.org/abs/2111.01253): NeurIPS 2021, faster 3x than original version because of [our CUDA speed up](assets/cuda/README.md), same (slightly better) performance. Done coding, public after review.
-- [ ] [FastNSF](https://arxiv.org/abs/2304.09121): ICCV 2023. Done coding, public after review.
-
-- [ ] ... more on the way
+2024/11/18 16:17: Updated model and demo data download links through HuggingFace. Personally, I found `wget` from the HuggingFace link is much faster than Zenodo.
-
+2024/09/26 16:24: All codes already uploaded and tested. You can try training directly by downloading (through [HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow)/[Zenodo](https://zenodo.org/records/13744999)) demo data or pretrained weight for evaluation.
-💡: Want to learn how to add your own network in this structure? Check [Contribute](assets/README.md#contribute) section and know more about the code. Fee free to pull request!
+Pre-trained weights for models are available in [Zenodo](https://zenodo.org/records/13744999)/[HuggingFace](https://huggingface.co/kin-zhang/OpenSceneFlow) link. Check usage in [2. Evaluation](#2-evaluation) or [3. Visualization](#3-visualization).
## 0. Setup
-**Environment**: Same to [DeFlow](https://github.com/KTH-RPL/DeFlow). And even lighter here with extracting mmcv module we needed into cuda assets.
+**Environment**: Clone the repo and build the environment, check [detailed installation](https://github.com/KTH-RPL/OpenSceneFlow/blob/main/assets/README.md) for more information. [Conda](https://docs.conda.io/projects/miniconda/en/latest/)/[Mamba](https://github.com/mamba-org/mamba) is recommended.
+
```bash
-git clone --recursive https://github.com/KTH-RPL/SeFlow.git
-cd SeFlow && mamba env create -f environment.yaml
+git clone --recursive https://github.com/KTH-RPL/OpenSceneFlow.git
+cd OpenSceneFlow
+mamba env create -f environment.yaml
```
CUDA package (need install nvcc compiler), the compile time is around 1-5 minutes:
```bash
-mamba activate seflow
+mamba activate opensf
# CUDA already install in python environment. I also tested others version like 11.3, 11.4, 11.7, 11.8 all works
cd assets/cuda/mmcv && python ./setup.py install && cd ../../..
cd assets/cuda/chamfer3D && python ./setup.py install && cd ../../..
```
-Or you always can choose [Docker](https://en.wikipedia.org/wiki/Docker_(software)) which isolated environment and free yourself from installation, you can pull it by.
-If you have different arch, please build it by yourself `cd SeFlow && docker build -t zhangkin/seflow` by going through [build-docker-image](https://github.com/KTH-RPL/DeFlow/blob/main/assets/README.md/#build-docker-image) section.
-```bash
-# option 1: pull from docker hub
-docker pull zhangkin/seflow
-
-# run container
-docker run -it --gpus all -v /dev/shm:/dev/shm -v /home/kin/data:/home/kin/data --name seflow zhangkin/seflow /bin/zsh
-```
+Or another environment setup choice is [Docker](https://en.wikipedia.org/wiki/Docker_(software)), which provides an isolated environment; check more information in [OpenSceneFlow/assets/README.md](https://github.com/KTH-RPL/OpenSceneFlow/blob/main/assets/README.md#docker-environment).
## 1. Run & Train
@@ -87,16 +53,6 @@ wget https://huggingface.co/kin-zhang/OpenSceneFlow/resolve/main/demo_data.zip
unzip demo_data.zip -p /home/kin/data/av2
```
-#### Prepare raw data
-
-Checking more information (step for downloading raw data, storage size, #frame etc) in [dataprocess/README.md](dataprocess/README.md). Extract all data to unified `.h5` format.
-[Runtime: Normally need 45 mins finished run following commands totally in setup mentioned in our paper]
-```bash
-python dataprocess/extract_av2.py --av2_type sensor --data_mode train --argo_dir /home/kin/data/av2 --output_dir /home/kin/data/av2/preprocess_v2
-python dataprocess/extract_av2.py --av2_type sensor --data_mode val --mask_dir /home/kin/data/av2/3d_scene_flow
-python dataprocess/extract_av2.py --av2_type sensor --data_mode test --mask_dir /home/kin/data/av2/3d_scene_flow
-```
-
#### Process train data
Process train data for self-supervised learning. Only training data needs this step. [Runtime: Normally need 15 hours for my desktop, 3 hours for the cluster with five available nodes parallel running.]
diff --git a/assets/README.md b/assets/README.md
deleted file mode 100644
index 07cc3cd..0000000
--- a/assets/README.md
+++ /dev/null
@@ -1,78 +0,0 @@
-SeFlow Assets
----
-
-There are two ways to setup the environment: conda in your desktop and docker container isolate environment.
-
-## Docker
-
-Docker installation check [DeFlow Assets](https://github.com/KTH-RPL/DeFlow/blob/main/assets/README.md#docker-environment). Then you can build and run the container by:
-
-```bash
-cd SeFlow
-docker build -t zhangkin/seflow .
-
-docker run -it --gpus all -v /dev/shm:/dev/shm -v /home/kin/data:/home/kin/data --name seflow zhangkin/seflow /bin/zsh
-```
-
-## Installation
-
-We will use conda to manage the environment with mamba for faster package installation.
-
-### System
-Install conda with mamba for package management and for faster package installation:
-```bash
-curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
-bash Miniforge3-$(uname)-$(uname -m).sh
-```
-
-### Environment
-
-Create base env: [~5 mins]
-
-```bash
-git clone https://github.com/KTH-RPL/SeFlow.git
-mamba env create -f assets/environment.yml
-```
-
-CUDA package (nvcc compiler already installed through conda), the compile time is around 1-5 minutes:
-```bash
-mamba activate seflow
-cd assets/cuda/mmcv && python ./setup.py install && cd ../../..
-cd assets/cuda/chamfer3D && python ./setup.py install && cd ../../..
-```
-
-
-Checking important packages in our environment now:
-```bash
-mamba activate seflow
-python -c "import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.version.cuda)"
-python -c "import lightning.pytorch as pl; print(pl.__version__)"
-python -c "from assets.cuda.mmcv import Voxelization, DynamicScatter;print('successfully import on our lite mmcv package')"
-python -c "from assets.cuda.chamfer3D import nnChamferDis;print('successfully import on our chamfer3D package')"
-```
-
-
-### Other issues
-
-1. looks like open3d and fire package conflict, not sure
- - need install package like `pip install --ignore-installed`, ref: [pip cannot install distutils installed project](https://stackoverflow.com/questions/53807511/pip-cannot-uninstall-package-it-is-a-distutils-installed-project), my error: `ERROR: Cannot uninstall 'blinker'.`
- - need specific werkzeug version for open3d 0.16.0, otherwise error: `ImportError: cannot import name 'url_quote' from 'werkzeug.urls'`. But need update to solve the problem: `pip install --upgrade Flask` [ref](https://stackoverflow.com/questions/77213053/why-did-flask-start-failing-with-importerror-cannot-import-name-url-quote-fr)
-
-
-2. `ImportError: libtorch_cuda.so: undefined symbol: cudaGraphInstantiateWithFlags, version libcudart.so.11.0`
- The cuda version: `pytorch::pytorch-cuda` and `nvidia::cudatoolkit` need be same. [Reference link](https://github.com/pytorch/pytorch/issues/90673#issuecomment-1563799299)
-
-
-3. In cluster have error: `pandas ImportError: /lib64/libstdc++.so.6: version 'GLIBCXX_3.4.29' not found`
- Solved by `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/proj/berzelius-2023-154/users/x_qinzh/mambaforge/lib`
-
-
-## Contribute
-
-If you want to contribute to new model, here are tips you can follow:
-1. Dataloader: we believe all data could be process to `.h5`, we named as different scene and inside a scene, the key of each data is timestamp. Check [dataprocess/README.md](../dataprocess/README.md#process) for more details.
-2. Model: All model files can be found [here: src/models](../src/models). You can view deflow and fastflow3d to know how to implement a new model. Don't forget to add to the `__init__.py` [file to import class](../src/models/__init__.py).
-3. Loss: All loss files can be found [here: src/lossfuncs.py](../src/lossfuncs.py). There are three loss functions already inside the file, you can add a new one following the same pattern.
-4. Training: Once you have implemented the model, you can add the model to the config file [here: conf/model](../conf/model) and train the model using the command `python train.py model=your_model_name`. One more note here may: if your res_dict from model output is different, you may need add one pattern in `def training_step` and `def validation_step`.
-
-All others like eval and vis will be changed according to the model you implemented as you follow the above steps.
\ No newline at end of file
diff --git a/assets/cuda/README.md b/assets/cuda/README.md
deleted file mode 100644
index 621bcd3..0000000
--- a/assets/cuda/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-My CUDA library
----
-
-Faster our code in CUDA.
-
-- chamfer3D: 3D chamfer distance within two point cloud, by Qingwen Zhang involved when she was working on SeFlow.
-- mmcv: directly from mmcv, not our code.
-
----
-
-Quick View about CUDA speed on our chamfer3D (Faster 60x than others):
-
-The number of points: (pc0: 88132, pc1: 88101)
-
-| Function | Time (ms) |
-| :---: | :---: |
-| Faiss | 817.698 |
-| CUDA([SCOOP](https://github.com/itailang/SCOOP/tree/master/auxiliary/ChamferDistancePytorch), Batch) | 83.275 |
-| Pytorch3D | 68.256 |
-| CUDA([SeFlow](https://github.com/KTH-RPL/SeFlow), SharedM) | **14.308** |
-| ~~mmcv~~(chamfer2D) | 651.510 |
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/README.md b/assets/cuda/chamfer3D/README.md
deleted file mode 100644
index abf8274..0000000
--- a/assets/cuda/chamfer3D/README.md
+++ /dev/null
@@ -1,155 +0,0 @@
-CUDA with Torch 初步尝试
----
-
-主要参考都是mmcv里的库 and [SCOOP](https://github.com/itailang/SCOOP/blob/master/auxiliary/ChamferDistancePytorch/chamfer3D/),只是想着仅提取这一个功能试一下,然后发现几个有意思的点
-CUDA with Torch C++ Programming, [torch official ref link](https://pytorch.org/tutorials/advanced/cpp_extension.html)
-
-1. `.cu` 不能被`.cpp` include,否则失去cu属性
-2. `.cpp`必须命名为`xxx_cuda.cpp` 否则torch CUDAextension不会找
-3. `.cu` 的include和平常的cuda编程有所不同:必须先include ATen然后再正常的导入 CUDA等库 [ref link](https://blog.csdn.net/weixin_39849839/article/details/125980694)
- ```cpp
- #include
-
- #include
- #include
- ```
-
-注意要自己多写一个lib_xxx.py 内含class以方便调用,但是class内的forward函数必须有 `ctx` 参数,否则会报错
-
-
-## Install
-```bash
-# change it if you use different cuda version
-export PATH=/usr/local/cuda-11.3/bin:$PATH
-export LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64:$LD_LIBRARY_PATH
-
-cd assets/cuda/chamfer3Dlib
-python ./setup.py install
-
-# then you will see
-Installed /home/kin/mambaforge/envs/seflow/lib/python3.8/site-packages/chamfer3D-1.0.0-py3.8-linux-x86_64.egg
-Processing dependencies for chamfer3D==1.0.0
-Finished processing dependencies for chamfer3D==1.0.0
-
-# then run with lib_voxelize.py to see if it works
-python ../chamfer_cuda.py
-```
-
-## ChamferDis Speed
-
-The number of points: (pc0: 88132, pc1: 88101)
-
-| Function | Time (ms) |
-| :---: | :---: |
-| Faiss | 817.698 |
-| CUDA([SCOOP](https://github.com/itailang/SCOOP/tree/master/auxiliary/ChamferDistancePytorch), Batch) | 83.275 |
-| Pytorch3D | 68.256 |
-| CUDA([SeFlow](https://github.com/KTH-RPL/SeFlow), SharedM) | **14.308** |
-| ~~mmcv~~(chamfer2D) | 651.510 |
-
-对比命令行:
-
-```bash
-cd assets/tests
-python chamferdis_speed_test.py
-```
-
-
-Test computer and System:
-- Desktop setting: i9-12900KF, GPU 3090, CUDA 11.3
-- System setting: Ubuntu 20.04, Python 3.8
-
-Output Example:
-```
-Output in my desktop with a 3090 GPU:
------- START Faiss Chamfer Distance Cal ------
-loss: tensor(0.1710, device='cuda:0')
-Faiss Chamfer Distance Cal time: 809.593 ms
-
------- START Pytorch3d Chamfer Distance Cal ------
-Pytorch3d Chamfer Distance Cal time: 68.906 ms
-loss: tensor(0.1710, device='cuda:0', grad_fn=)
-
------- START CUDA Chamfer Distance Cal ------
-Chamfer Distance Cal time: 1.814 ms
-loss: tensor(0.1710, device='cuda:0', grad_fn=)
-```
-
-## Mics
-
-
-### Note for CUDA ChamferDis
-
-主要是 两个月前写的 已经看不懂了;然后问题原因是因为 总是缺0.0003的精度(精度强迫症患者)
-然后就以为是自己写错了 后面发现是因为block的这种并行化 线程大小的不同对CUDA的浮点运算会有所不同,所以导致精度差距是有一点的 如果介意的话 可以使用pytorch3d的版本(也就是速度慢4倍左右 从15ms 到 80ms)
-
-这里主要重申一遍 shared memory在这里的用法:
-1. 首先我们每个点都会分开走到 `int tid = blockIdx.x * blockDim.x + threadIdx.x;` 也就是全局索引,注意这个每个点都分开了 因为pc0每个点和pc1的临近点 和 其他的pc0点无关
-2. 然后走到每个点内部 就是__shared__ 我们首先建立了 pc1的share,但是因为共享内存有限,所以每次只保存THREADS_PER_BLOCK
-3. 保存 THREADS_PER_BLOCK 也是每个线程做的 我们在对比距离前 运行了 __syncthreads(); 确保 THREADS_PER_BLOCK 个点的 pc1 已经到了
-4. 接着 我们在 `num_elems` 这一部分的数据内进行对比,同步best
-5. 最后传给 全局这个点的 `result`
-
-需要注意的是 这种极致的并行化 会对精度产生一定的影响,但是如果你感兴趣 `#define THREADS_PER_BLOCK 256` 可以调整这个,对每个block设置不同的threads 会对精度有影响(当然 影响是 在 gt: 0.1710 但cuda计算会是 0.1711 - 0.1713之间)
-
-以下为chatgpt:
-精度差异的原因之一可能是由于在不同的线程块大小下,浮点运算的顺序发生了改变。由于浮点运算是不结合的(即(a + b) + c 可能不等于 a + (b + c)),因此改变运算的顺序可能会导致轻微的结果差异。
-
-这种类型的精度变化在GPU计算中是非常常见的,特别是在使用较大的数据集和进行大量的浮点运算时。要完全消除这种差异是非常困难的,因为即使是非常微小的实现细节变化(例如改变线程块大小、更改循环的结构、甚至是不同的GPU硬件或不同的CUDA版本)都可能导致浮点运算顺序的微小变化。
-
-如果需要确保结果的一致性,可以考虑以下方法:
-
-1. 固定线程块大小:选择一个固定的线程块大小,并始终使用它。
-
-2. 双精度浮点数(Double Precision):使用double类型代替float,可以提高精度,但代价是更高的内存使用和可能的性能下降。
-
-3. 数值稳定的算法:尽量使用数值稳定的算法,尽管这在GPU上实现起来可能比较复杂且效率较低。
-
-4. 减少并行化程度:通过减少并行化程度来减少由于不同线程执行顺序引起的差异,但这通常会牺牲性能。
-
-
-复制代码部分如下:
-```cpp
-
-for (int i = 0; i < pc1_n; i += THREADS_PER_BLOCK) {
- // Copy a block of pc1 to shared memory
- int pc1_idx = i + threadIdx.x;
- if (pc1_idx < pc1_n) {
- shared_pc1[threadIdx.x * 3 + 0] = pc1_xyz[pc1_idx * 3 + 0];
- shared_pc1[threadIdx.x * 3 + 1] = pc1_xyz[pc1_idx * 3 + 1];
- shared_pc1[threadIdx.x * 3 + 2] = pc1_xyz[pc1_idx * 3 + 2];
- }
-
- __syncthreads();
-
- // Compute the distance between pc0[tid] and the points in shared_pc1
- // NOTE(Qingwen): since after two months I forgot what I did here, I write some notes for future me
- // 0. One reason for the difference in precision may be due to the changing order of floating point operations at different thread block sizes.
- // But I think it's fine we lose 0.0001 precision for speed up cal time 4x
- // 1. since we use shared to store pc1, here Every BLOCK will have new shared_pc1 start from 0
- // 2. we use THREADS_PER_BLOCK to loop pc1, so we need to check if the last block is not full
- // 3. Based on the CUDA document, the __syncthreads() is not necessary here, but we keep it for safety
- // 4. After running once, we go for next block of pc1, and find the best in that batch
-
- int num_elems = min(THREADS_PER_BLOCK, pc1_n - i);
- for (int j = 0; j < num_elems; j++) {
- float x1 = shared_pc1[j * 3 + 0];
- float y1 = shared_pc1[j * 3 + 1];
- float z1 = shared_pc1[j * 3 + 2];
- float d = (x1 - x0) * (x1 - x0) + (y1 - y0) * (y1 - y0) + (z1 - z0) * (z1 - z0);
- if (d < best) {
- best = d;
- best_i = j + i;
- }
- }
- __syncthreads();
-}
-```
-
-## Other issues
-In cluster when build cuda things, you may occur problem:
-- `gcc: error trying to exec 'cc1plus': execvp: No such file or directory`,
- Main reason is gcc and g++ version problem in cluster, you can try to install inside `conda` to solve that, with: `mamba install -c conda-forge gxx==9.5.0`. And the reason why I set 9.5.0 is because of the version for cuda 11.3 need inside specific version.
- ```bash
- RuntimeError: The current installed version of g++ (13.2.0) is greater than the maximum required version by CUDA 11.3. Please make sure to use an adequate version of g++ (>=5.0.0, <11.0).
- ```
diff --git a/assets/cuda/chamfer3D/__init__.py b/assets/cuda/chamfer3D/__init__.py
deleted file mode 100644
index 3aac3a5..0000000
--- a/assets/cuda/chamfer3D/__init__.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""
-# Created: 2023-08-04 11:20
-# Copyright (C) 2023-now, RPL, KTH Royal Institute of Technology
-# Author: Qingwen Zhang (https://kin-zhang.github.io/)
-#
-# This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
-# If you find this repo helpful, please cite the respective publication as
-# listed on the above website.
-#
-#
-# Description: ChamferDis speedup using CUDA
-"""
-from torch import nn
-from torch.autograd import Function
-import torch
-
-import os, time
-import chamfer3D
-BASE_DIR = os.path.abspath(os.path.join( os.path.dirname( __file__ ), '../..' ))
-
-
-# GPU tensors only
-class ChamferDis(Function):
- @staticmethod
- def forward(ctx, pc0, pc1):
- # pc0: (N,3), pc1: (M,3)
- dis0 = torch.zeros(pc0.shape[0]).to(pc0.device).contiguous()
- dis1 = torch.zeros(pc1.shape[0]).to(pc1.device).contiguous()
-
- idx0 = torch.zeros(pc0.shape[0], dtype=torch.int32).to(pc0.device).contiguous()
- idx1 = torch.zeros(pc1.shape[0], dtype=torch.int32).to(pc1.device).contiguous()
-
-
- chamfer3D.forward(pc0, pc1, dis0, dis1, idx0, idx1)
- ctx.save_for_backward(pc0, pc1, idx0, idx1)
- return dis0, dis1, idx0, idx1
-
- @staticmethod
- def backward(ctx, grad_dist0, grad_dist1, grad_idx0, grad_idx1):
- pc0, pc1, idx0, idx1 = ctx.saved_tensors
- grad_dist0 = grad_dist0.contiguous()
- grad_dist1 = grad_dist1.contiguous()
- device = grad_dist1.device
-
- grad_pc0 = torch.zeros(pc0.size()).to(device).contiguous()
- grad_pc1 = torch.zeros(pc1.size()).to(device).contiguous()
-
- chamfer3D.backward(
- pc0, pc1, idx0, idx1, grad_dist0, grad_dist1, grad_pc0, grad_pc1
- )
- return grad_pc0, grad_pc1
-
-class nnChamferDis(nn.Module):
- def __init__(self, truncate_dist=True):
- super(nnChamferDis, self).__init__()
- self.truncate_dist = truncate_dist
-
- def forward(self, input0, input1, truncate_dist=-1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, _, _ = ChamferDis.apply(input0, input1)
-
- if truncate_dist<=0:
- return torch.mean(dist0) + torch.mean(dist1)
-
- valid_mask0 = (dist0 <= truncate_dist)
- valid_mask1 = (dist1 <= truncate_dist)
- truncated_sum = torch.nanmean(dist0[valid_mask0]) + torch.nanmean(dist1[valid_mask1])
- return truncated_sum
-
- def dis_res(self, input0, input1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, _, _ = ChamferDis.apply(input0, input1)
- return dist0, dist1
-
- def truncated_dis(self, input0, input1):
- # nsfp: truncated distance way is set >= 2 to 0 but not nanmean
- cham_x, cham_y = self.dis_res(input0, input1)
- cham_x[cham_x >= 2] = 0.0
- cham_y[cham_y >= 2] = 0.0
- return torch.mean(cham_x) + torch.mean(cham_y)
-
- def disid_res(self, input0, input1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, idx0, idx1 = ChamferDis.apply(input0, input1)
- return dist0, dist1, idx0, idx1
-class NearestNeighborDis(nn.Module):
- def __init__(self):
- super(NearestNeighborDis, self).__init__()
-
- def forward(self, input0, input1):
- input0 = input0.contiguous()
- input1 = input1.contiguous()
- dist0, dist1, _, _ = ChamferDis.apply(input0, input1)
-
- return torch.mean(dist0[dist0 <= 2])
-
-if __name__ == "__main__":
- import numpy as np
- pc0 = np.load(f'{BASE_DIR}/assets/tests/test_pc0.npy')
- pc1 = np.load(f'{BASE_DIR}/assets/tests/test_pc1.npy')
- print('0: {:.3f}MB'.format(torch.cuda.memory_allocated()/1024**2))
- pc0 = torch.from_numpy(pc0[...,:3]).float().cuda().contiguous()
- pc1 = torch.from_numpy(pc1[...,:3]).float().cuda().contiguous()
- pc0.requires_grad = True
- pc1.requires_grad = True
- print(pc0.shape, "demo data: ", pc0[0])
- print(pc1.shape, "demo data: ", pc1[0])
- print('1: {:.3f}MB'.format(torch.cuda.memory_allocated()/1024**2))
-
- start_time = time.time()
- loss = nnChamferDis(truncate_dist=False)(pc0, pc1)
- loss.backward()
- print("loss: ", loss)
- print(f"Chamfer Distance Cal time: {(time.time() - start_time)*1000:.3f} ms")
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/chamfer3D.cu b/assets/cuda/chamfer3D/chamfer3D.cu
deleted file mode 100644
index 72af47d..0000000
--- a/assets/cuda/chamfer3D/chamfer3D.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (C) 2022-now, RPL, KTH Royal Institute of Technology
- * @Author: Qingwen Zhang (https://kin-zhang.github.io/)
- * @Date: 2023-08-03 16:55
- * @Description: Chamfer distance calculation between two point clouds with CUDA
- * This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
- * If you find this repo helpful, please cite the respective publication as
- * listed on the above website.
-
-
- * Reference: Modified from SCOOP chamfer3D [https://github.com/itailang/SCOOP]
- * faster 2x than the original version
-*/
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-
-#define CUDA_1D_KERNEL_LOOP(i, n) \
- for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x)
-
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-#define THREADS_PER_BLOCK 256
-
-
-__global__ void NmDistanceKernel(const int pc0_n, const float *pc0_xyz, const int pc1_n, const float *pc1_xyz, float *result, int *result_i){
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
- if (tid >= pc0_n) return;
-
- float x0 = pc0_xyz[tid * 3 + 0];
- float y0 = pc0_xyz[tid * 3 + 1];
- float z0 = pc0_xyz[tid * 3 + 2];
-
- __shared__ float shared_pc1[THREADS_PER_BLOCK * 3];
-
- int best_i = -1;
- float best = 1e20;
-
- for (int i = 0; i < pc1_n; i += THREADS_PER_BLOCK) {
- // Copy a block of pc1 to shared memory
- int pc1_idx = i + threadIdx.x;
- if (pc1_idx < pc1_n) {
- shared_pc1[threadIdx.x * 3 + 0] = pc1_xyz[pc1_idx * 3 + 0];
- shared_pc1[threadIdx.x * 3 + 1] = pc1_xyz[pc1_idx * 3 + 1];
- shared_pc1[threadIdx.x * 3 + 2] = pc1_xyz[pc1_idx * 3 + 2];
- }
-
- __syncthreads();
-
- // Compute the distance between pc0[tid] and the points in shared_pc1
- int num_elems = min(THREADS_PER_BLOCK, pc1_n - i);
- for (int j = 0; j < num_elems; j++) {
- float x1 = shared_pc1[j * 3 + 0];
- float y1 = shared_pc1[j * 3 + 1];
- float z1 = shared_pc1[j * 3 + 2];
- float d = (x1 - x0) * (x1 - x0) + (y1 - y0) * (y1 - y0) + (z1 - z0) * (z1 - z0);
- if (d < best) {
- best = d;
- best_i = j + i;
- }
- }
-
- __syncthreads();
- }
-
- // done with this thread in tid in pc_0, save the result to global memory
- atomicExch(&result[tid], best);
- atomicExch(&result_i[tid], best_i);
-}
-
-int chamfer_cuda_forward(const at::Tensor &pc0, const at::Tensor &pc1, at::Tensor &dist0, at::Tensor &dist1, at::Tensor &idx0, at::Tensor &idx1)
-{
- at::cuda::CUDAGuard device_guard(pc0.device());
- cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- const int pc0_n = pc0.size(0);
- const int pc1_n = pc1.size(0);
-
- const int col_blocks_pc0 = DIVUP(pc0_n, THREADS_PER_BLOCK);
- dim3 blocks_pc0(col_blocks_pc0);
- const int col_blocks_pc1 = DIVUP(pc1_n, THREADS_PER_BLOCK);
- dim3 blocks_pc1(col_blocks_pc1);
- dim3 threads(THREADS_PER_BLOCK);
-
- NmDistanceKernel<<>>(pc0_n, pc0.data_ptr(), pc1_n, pc1.data_ptr(), dist0.data_ptr(), idx0.data_ptr());
- NmDistanceKernel<<>>(pc1_n, pc1.data_ptr(), pc0_n, pc0.data_ptr(), dist1.data_ptr(), idx1.data_ptr());
-
- AT_CUDA_CHECK(cudaGetLastError());
-
- return 1;
-}
-
-__global__ void NmDistanceGradKernel(const int pc0_n, const float *pc0_xyz, const int pc1_n, const float *pc1_xyz,
- const float *grad_dist0, const int *idx0, float *grad_pc0, float *grad_pc1)
-{
- CUDA_1D_KERNEL_LOOP(j0, pc0_n){
- float x0 = pc0_xyz[j0 * 3 + 0];
- float y0 = pc0_xyz[j0 * 3 + 1];
- float z0 = pc0_xyz[j0 * 3 + 2];
-
- int j1 = idx0[j0];
- float x1 = pc1_xyz[j1 * 3 + 0];
- float y1 = pc1_xyz[j1 * 3 + 1];
- float z1 = pc1_xyz[j1 * 3 + 2];
-
- float g = grad_dist0[j0] * 2;
-
- atomicAdd(&grad_pc0[j0 * 3 + 0], g * (x0 - x1));
- atomicAdd(&grad_pc0[j0 * 3 + 1], g * (y0 - y1));
- atomicAdd(&grad_pc0[j0 * 3 + 2], g * (z0 - z1));
-
- atomicAdd(&grad_pc1[j1 * 3 + 0], - (g * (x0 - x1)));
- atomicAdd(&grad_pc1[j1 * 3 + 1], - (g * (y0 - y1)));
- atomicAdd(&grad_pc1[j1 * 3 + 2], - (g * (z0 - z1)));
- }
-}
-int chamfer_cuda_backward(const at::Tensor &pc0, const at::Tensor &pc1,
- const at::Tensor &idx0, const at::Tensor &idx1,
- at::Tensor &grad_dist0, at::Tensor &grad_dist1,
- at::Tensor &grad_pc0, at::Tensor &grad_pc1)
-{
- at::cuda::CUDAGuard device_guard(pc0.device());
- cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- const int pc0_n = pc0.size(0);
- const int pc1_n = pc1.size(0);
-
- const int col_blocks_pc0 = DIVUP(pc0_n, THREADS_PER_BLOCK);
- dim3 blocks_pc0(col_blocks_pc0);
- const int col_blocks_pc1 = DIVUP(pc1_n, THREADS_PER_BLOCK);
- dim3 blocks_pc1(col_blocks_pc1);
- dim3 threads(THREADS_PER_BLOCK);
-
- NmDistanceGradKernel<<>>(pc0_n, pc0.data_ptr(), pc1_n, pc1.data_ptr(), grad_dist0.data_ptr(), idx0.data_ptr(), grad_pc0.data_ptr(), grad_pc1.data_ptr());
- NmDistanceGradKernel<<>>(pc1_n, pc1.data_ptr(), pc0_n, pc0.data_ptr(), grad_dist1.data_ptr(), idx1.data_ptr(), grad_pc1.data_ptr(), grad_pc0.data_ptr());
-
- AT_CUDA_CHECK(cudaGetLastError());
-
- return 1;
-}
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/chamfer3D_cuda.cpp b/assets/cuda/chamfer3D/chamfer3D_cuda.cpp
deleted file mode 100644
index 7a89216..0000000
--- a/assets/cuda/chamfer3D/chamfer3D_cuda.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2022-now, RPL, KTH Royal Institute of Technology
- * @Author: Qingwen Zhang (https://kin-zhang.github.io/)
- * @Date: 2023-08-03 16:55
- * @Description: Chamfer distance calculation between two point clouds with CUDA
- * This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
- * If you find this repo helpful, please cite the respective publication as
- * listed on the above website.
-
-
- * Reference: Modified from SCOOP chamfer3D [https://github.com/itailang/SCOOP]
- * faster 2x than the original version
-*/
-
-#include
-#include
-
-int chamfer_cuda_forward(
- const at::Tensor &pc0,
- const at::Tensor &pc1,
- at::Tensor &dist0,
- at::Tensor &dist1,
- at::Tensor &idx0,
- at::Tensor &idx1);
-
-int chamfer_cuda_backward(
- const at::Tensor &pc0, const at::Tensor &pc1,
- const at::Tensor &idx0, const at::Tensor &idx1,
- at::Tensor &grad_dist0, at::Tensor &grad_dist1,
- at::Tensor &grad_pc0, at::Tensor &grad_pc1);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
- m.def("forward", &chamfer_cuda_forward, "Chamfer Distance (CUDA)");
- m.def("backward", &chamfer_cuda_backward, "Chamfer Distance (CUDA) Backward Grad");
-}
\ No newline at end of file
diff --git a/assets/cuda/chamfer3D/setup.py b/assets/cuda/chamfer3D/setup.py
deleted file mode 100755
index 2c16070..0000000
--- a/assets/cuda/chamfer3D/setup.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-setup(
- name='chamfer3D',
- ext_modules=[
- CUDAExtension('chamfer3D', [
- "/".join(__file__.split('/')[:-1] + ['chamfer3D_cuda.cpp']), # must named as xxx_cuda.cpp
- "/".join(__file__.split('/')[:-1] + ['chamfer3D.cu']),
- ]),
- ],
- cmdclass={
- 'build_ext': BuildExtension
- },
- version='1.0.1')
diff --git a/assets/cuda/mmcv/README.md b/assets/cuda/mmcv/README.md
deleted file mode 100644
index 5fa6799..0000000
--- a/assets/cuda/mmcv/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-mmcv
----
-
-This file extracted functions we need used in mmcv to release the dependency of mmcv-full and faster the installation process.
\ No newline at end of file
diff --git a/assets/cuda/mmcv/__init__.py b/assets/cuda/mmcv/__init__.py
deleted file mode 100755
index dca7091..0000000
--- a/assets/cuda/mmcv/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .voxelize import Voxelization, voxelization
-from .scatter_points import DynamicScatter, dynamic_scatter
-
-__all__ = [
- 'Voxelization', 'voxelization',
- 'dynamic_scatter', 'DynamicScatter'
-]
diff --git a/assets/cuda/mmcv/common_cuda_helper.hpp b/assets/cuda/mmcv/common_cuda_helper.hpp
deleted file mode 100644
index b12aa9a..0000000
--- a/assets/cuda/mmcv/common_cuda_helper.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef COMMON_CUDA_HELPER
-#define COMMON_CUDA_HELPER
-
-#include
-
-#define CUDA_1D_KERNEL_LOOP(i, n) \
- for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x)
-
-#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
- for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x) \
- for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
- j += blockDim.y * gridDim.y)
-
-#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \
- for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
- for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
-
-#define THREADS_PER_BLOCK 512
-
-inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
- int optimal_block_num = (N + num_threads - 1) / num_threads;
- int max_block_num = 4096;
- return min(optimal_block_num, max_block_num);
-}
-
-template
-__device__ T bilinear_interpolate(const T* input, const int height,
- const int width, T y, T x,
- const int index /* index for debug only*/) {
- // deal with cases that inverse elements are out of feature map boundary
- if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
-
- if (y <= 0) y = 0;
- if (x <= 0) x = 0;
-
- int y_low = (int)y;
- int x_low = (int)x;
- int y_high;
- int x_high;
-
- if (y_low >= height - 1) {
- y_high = y_low = height - 1;
- y = (T)y_low;
- } else {
- y_high = y_low + 1;
- }
-
- if (x_low >= width - 1) {
- x_high = x_low = width - 1;
- x = (T)x_low;
- } else {
- x_high = x_low + 1;
- }
-
- T ly = y - y_low;
- T lx = x - x_low;
- T hy = 1. - ly, hx = 1. - lx;
- // do bilinear interpolation
- T v1 = input[y_low * width + x_low];
- T v2 = input[y_low * width + x_high];
- T v3 = input[y_high * width + x_low];
- T v4 = input[y_high * width + x_high];
- T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
- T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
- return val;
-}
-
-template
-__device__ void bilinear_interpolate_gradient(
- const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
- int& x_low, int& x_high, int& y_low, int& y_high,
- const int index /* index for debug only*/) {
- // deal with cases that inverse elements are out of feature map boundary
- if (y < -1.0 || y > height || x < -1.0 || x > width) {
- // empty
- w1 = w2 = w3 = w4 = 0.;
- x_low = x_high = y_low = y_high = -1;
- return;
- }
-
- if (y <= 0) y = 0;
- if (x <= 0) x = 0;
-
- y_low = (int)y;
- x_low = (int)x;
-
- if (y_low >= height - 1) {
- y_high = y_low = height - 1;
- y = (T)y_low;
- } else {
- y_high = y_low + 1;
- }
-
- if (x_low >= width - 1) {
- x_high = x_low = width - 1;
- x = (T)x_low;
- } else {
- x_high = x_low + 1;
- }
-
- T ly = y - y_low;
- T lx = x - x_low;
- T hy = 1. - ly, hx = 1. - lx;
-
- // reference in forward
- // T v1 = input[y_low * width + x_low];
- // T v2 = input[y_low * width + x_high];
- // T v3 = input[y_high * width + x_low];
- // T v4 = input[y_high * width + x_high];
- // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
- w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
- return;
-}
-#endif // COMMON_CUDA_HELPER
diff --git a/assets/cuda/mmcv/cudabind.cpp b/assets/cuda/mmcv/cudabind.cpp
deleted file mode 100644
index 6dd27f1..0000000
--- a/assets/cuda/mmcv/cudabind.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-#include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"
-
-typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
-
-void DynamicVoxelizeForwardCUDAKernelLauncher(
- const at::Tensor &points, at::Tensor &coors,
- const std::vector voxel_size, const std::vector coors_range,
- const int NDim = 3);
-
-void dynamic_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &coors,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int NDim) {
- DynamicVoxelizeForwardCUDAKernelLauncher(points, coors, voxel_size,
- coors_range, NDim);
-};
-
-void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int NDim);
-
-
-int HardVoxelizeForwardCUDAKernelLauncher(
- const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
- at::Tensor &num_points_per_voxel, const std::vector voxel_size,
- const std::vector coors_range, const int max_points,
- const int max_voxels, const int NDim = 3);
-
-int hard_voxelize_forward_cuda(const at::Tensor &points, at::Tensor &voxels,
- at::Tensor &coors,
- at::Tensor &num_points_per_voxel,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int max_points, const int max_voxels,
- const int NDim) {
- return HardVoxelizeForwardCUDAKernelLauncher(
- points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
- max_points, max_voxels, NDim);
-};
-
-int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
- at::Tensor &coors,
- at::Tensor &num_points_per_voxel,
- const std::vector voxel_size,
- const std::vector coors_range,
- const int max_points, const int max_voxels,
- const int NDim);
-
-int nondeterministic_hard_voxelize_forward_impl(
- const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
- at::Tensor &num_points_per_voxel, const std::vector voxel_size,
- const std::vector coors_range, const int max_points,
- const int max_voxels, const int NDim);
-
-REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,
- hard_voxelize_forward_cuda);
-REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,
- dynamic_voxelize_forward_cuda);
-
-
-std::vector DynamicPointToVoxelForwardCUDAKernelLauncher(
- const at::Tensor &feats, const at::Tensor &coors,
- const reduce_t reduce_type);
-
-
-std::vector dynamic_point_to_voxel_forward_cuda(
- const torch::Tensor &feats, const torch::Tensor &coors,
- const reduce_t reduce_type) {
- return DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors,
- reduce_type);
-};
-
-void DynamicPointToVoxelBackwardCUDAKernelLauncher(
- at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
- const at::Tensor &feats, const at::Tensor &reduced_feats,
- const at::Tensor &coors_map, const at::Tensor &reduce_count,
- const reduce_t reduce_type);
-
-void dynamic_point_to_voxel_backward_cuda(
- torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
- const torch::Tensor &feats, const torch::Tensor &reduced_feats,
- const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
- const reduce_t reduce_type) {
- DynamicPointToVoxelBackwardCUDAKernelLauncher(grad_feats, grad_reduced_feats,
- feats, reduced_feats, coors_idx,
- reduce_count, reduce_type);
-};
-
-std::vector dynamic_point_to_voxel_forward_impl(
- const torch::Tensor &feats, const torch::Tensor &coors,
- const reduce_t reduce_type);
-
-void dynamic_point_to_voxel_backward_impl(
- torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
- const torch::Tensor &feats, const torch::Tensor &reduced_feats,
- const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
- const reduce_t reduce_type);
-
-REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, CUDA,
- dynamic_point_to_voxel_forward_cuda);
-REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, CUDA,
- dynamic_point_to_voxel_backward_cuda);
\ No newline at end of file
diff --git a/assets/cuda/mmcv/pybind.cpp b/assets/cuda/mmcv/pybind.cpp
deleted file mode 100644
index 1b0823b..0000000
--- a/assets/cuda/mmcv/pybind.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved
-#include
-
-#include "pytorch_cpp_helper.hpp"
-
-
-std::vector dynamic_point_to_voxel_forward(
- const torch::Tensor &feats, const torch::Tensor &coors,
- const std::string &reduce_type);
-
-void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
- const torch::Tensor &grad_reduced_feats,
- const torch::Tensor &feats,
- const torch::Tensor &reduced_feats,
- const torch::Tensor &coors_idx,
- const torch::Tensor &reduce_count,
- const std::string &reduce_type);
-
-void dynamic_voxelize_forward(const at::Tensor &points,
- const at::Tensor &voxel_size,
- const at::Tensor &coors_range, at::Tensor &coors,
- const int NDim);
-
-void hard_voxelize_forward(const at::Tensor &points,
- const at::Tensor &voxel_size,
- const at::Tensor &coors_range, at::Tensor &voxels,
- at::Tensor &coors, at::Tensor &num_points_per_voxel,
- at::Tensor &voxel_num, const int max_points,
- const int max_voxels, const int NDim,
- const bool deterministic);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
- m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward,
- "dynamic_point_to_voxel_forward", py::arg("feats"), py::arg("coors"),
- py::arg("reduce_type"));
- m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward,
- "dynamic_point_to_voxel_backward", py::arg("grad_feats"),
- py::arg("grad_reduced_feats"), py::arg("feats"),
- py::arg("reduced_feats"), py::arg("coors_idx"), py::arg("reduce_count"),
- py::arg("reduce_type"));
- m.def("hard_voxelize_forward", &hard_voxelize_forward,
- "hard_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
- py::arg("coors_range"), py::arg("voxels"), py::arg("coors"),
- py::arg("num_points_per_voxel"), py::arg("voxel_num"),
- py::arg("max_points"), py::arg("max_voxels"), py::arg("NDim"),
- py::arg("deterministic"));
- m.def("dynamic_voxelize_forward", &dynamic_voxelize_forward,
- "dynamic_voxelize_forward", py::arg("points"), py::arg("voxel_size"),
- py::arg("coors_range"), py::arg("coors"), py::arg("NDim"));
-}
\ No newline at end of file
diff --git a/assets/cuda/mmcv/pytorch_cpp_helper.hpp b/assets/cuda/mmcv/pytorch_cpp_helper.hpp
deleted file mode 100644
index f68e874..0000000
--- a/assets/cuda/mmcv/pytorch_cpp_helper.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef PYTORCH_CPP_HELPER
-#define PYTORCH_CPP_HELPER
-#include
-
-#include
-
-using namespace at;
-
-#define CHECK_CUDA(x) \
- TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_MLU(x) \
- TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor")
-#define CHECK_CPU(x) \
- TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
-#define CHECK_CONTIGUOUS(x) \
- TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_CUDA_INPUT(x) \
- CHECK_CUDA(x); \
- CHECK_CONTIGUOUS(x)
-#define CHECK_MLU_INPUT(x) \
- CHECK_MLU(x); \
- CHECK_CONTIGUOUS(x)
-#define CHECK_CPU_INPUT(x) \
- CHECK_CPU(x); \
- CHECK_CONTIGUOUS(x)
-
-#endif // PYTORCH_CPP_HELPER
diff --git a/assets/cuda/mmcv/pytorch_cuda_helper.hpp b/assets/cuda/mmcv/pytorch_cuda_helper.hpp
deleted file mode 100644
index 52e5126..0000000
--- a/assets/cuda/mmcv/pytorch_cuda_helper.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef PYTORCH_CUDA_HELPER
-#define PYTORCH_CUDA_HELPER
-
-#include
-#include
-#include
-
-#include
-#include
-
-#include "common_cuda_helper.hpp"
-
-using at::Half;
-using at::Tensor;
-using phalf = at::Half;
-
-#define __PHALF(x) (x)
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-
-#endif // PYTORCH_CUDA_HELPER
diff --git a/assets/cuda/mmcv/pytorch_device_registry.hpp b/assets/cuda/mmcv/pytorch_device_registry.hpp
deleted file mode 100644
index 2a32b72..0000000
--- a/assets/cuda/mmcv/pytorch_device_registry.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#ifndef PYTORCH_DEVICE_REGISTRY_H
-#define PYTORCH_DEVICE_REGISTRY_H
-
-// Using is recommended in the official documentation in
-// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
-// However, we use for compatibility with CUDA 9.0
-// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
-#include
-
-#include
-#include
-#include