| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ |
| 3 | |
| 4 | #include <linux/ns_common.h> |
| 5 | #include <linux/nstree.h> |
| 6 | #include <linux/proc_ns.h> |
| 7 | #include <linux/user_namespace.h> |
| 8 | #include <linux/vfsdebug.h> |
| 9 | |
#ifdef CONFIG_DEBUG_VFS
/*
 * Debug-only consistency check: for every namespace type that is compiled
 * in, warn once if @ops is not the proc_ns_operations table that belongs to
 * @ns->ns_type. Each type has its own warning site. Types that are not
 * listed (or whose support is compiled out) are not checked.
 */
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
	switch (ns->ns_type) {
#ifdef CONFIG_CGROUPS
	case CLONE_NEWCGROUP:
		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
		break;
#endif
#ifdef CONFIG_IPC_NS
	case CLONE_NEWIPC:
		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
		break;
#endif
	/* Mount namespaces are checked unconditionally. */
	case CLONE_NEWNS:
		VFS_WARN_ON_ONCE(ops != &mntns_operations);
		break;
#ifdef CONFIG_NET_NS
	case CLONE_NEWNET:
		VFS_WARN_ON_ONCE(ops != &netns_operations);
		break;
#endif
#ifdef CONFIG_PID_NS
	case CLONE_NEWPID:
		VFS_WARN_ON_ONCE(ops != &pidns_operations);
		break;
#endif
#ifdef CONFIG_TIME_NS
	case CLONE_NEWTIME:
		VFS_WARN_ON_ONCE(ops != &timens_operations);
		break;
#endif
#ifdef CONFIG_USER_NS
	case CLONE_NEWUSER:
		VFS_WARN_ON_ONCE(ops != &userns_operations);
		break;
#endif
#ifdef CONFIG_UTS_NS
	case CLONE_NEWUTS:
		VFS_WARN_ON_ONCE(ops != &utsns_operations);
		break;
#endif
	default:
		/* No expected ops table for this type: nothing to verify. */
		break;
	}
}
#endif
| 55 | |
| 56 | int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) |
| 57 | { |
| 58 | int ret = 0; |
| 59 | |
| 60 | refcount_set(r: &ns->__ns_ref, n: 1); |
| 61 | ns->stashed = NULL; |
| 62 | ns->ops = ops; |
| 63 | ns->ns_id = 0; |
| 64 | ns->ns_type = ns_type; |
| 65 | ns_tree_node_init(node: &ns->ns_tree_node); |
| 66 | ns_tree_node_init(node: &ns->ns_unified_node); |
| 67 | ns_tree_node_init(node: &ns->ns_owner_node); |
| 68 | ns_tree_root_init(root: &ns->ns_owner_root); |
| 69 | |
| 70 | #ifdef CONFIG_DEBUG_VFS |
| 71 | ns_debug(ns, ops); |
| 72 | #endif |
| 73 | |
| 74 | if (inum) |
| 75 | ns->inum = inum; |
| 76 | else |
| 77 | ret = proc_alloc_inum(pino: &ns->inum); |
| 78 | if (ret) |
| 79 | return ret; |
| 80 | /* |
| 81 | * Tree ref starts at 0. It's incremented when namespace enters |
| 82 | * active use (installed in nsproxy) and decremented when all |
| 83 | * active uses are gone. Initial namespaces are always active. |
| 84 | */ |
| 85 | if (is_ns_init_inum(ns)) |
| 86 | atomic_set(v: &ns->__ns_ref_active, i: 1); |
| 87 | else |
| 88 | atomic_set(v: &ns->__ns_ref_active, i: 0); |
| 89 | return 0; |
| 90 | } |
| 91 | |
| 92 | void __ns_common_free(struct ns_common *ns) |
| 93 | { |
| 94 | proc_free_inum(inum: ns->inum); |
| 95 | } |
| 96 | |
| 97 | struct ns_common *__must_check ns_owner(struct ns_common *ns) |
| 98 | { |
| 99 | struct user_namespace *owner; |
| 100 | |
| 101 | if (unlikely(!ns->ops)) |
| 102 | return NULL; |
| 103 | VFS_WARN_ON_ONCE(!ns->ops->owner); |
| 104 | owner = ns->ops->owner(ns); |
| 105 | VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); |
| 106 | if (!owner) |
| 107 | return NULL; |
| 108 | /* Skip init_user_ns as it's always active */ |
| 109 | if (owner == &init_user_ns) |
| 110 | return NULL; |
| 111 | return to_ns_common(owner); |
| 112 | } |
| 113 | |
| 114 | /* |
| 115 | * The active reference count works by having each namespace that gets |
| 116 | * created take a single active reference on its owning user namespace. |
| 117 | * That single reference is only released once the child namespace's |
| 118 | * active count itself goes down. |
| 119 | * |
| 120 | * A regular namespace tree might look as follow: |
| 121 | * Legend: |
| 122 | * + : adding active reference |
| 123 | * - : dropping active reference |
| 124 | * x : always active (initial namespace) |
| 125 | * |
| 126 | * |
| 127 | * net_ns pid_ns |
| 128 | * \ / |
| 129 | * + + |
| 130 | * user_ns1 (2) |
| 131 | * | |
| 132 | * ipc_ns | uts_ns |
| 133 | * \ | / |
| 134 | * + + + |
| 135 | * user_ns2 (3) |
| 136 | * | |
| 137 | * cgroup_ns | mnt_ns |
| 138 | * \ | / |
| 139 | * x x x |
| 140 | * init_user_ns (1) |
| 141 | * |
| 142 | * If both net_ns and pid_ns put their last active reference on |
| 143 | * themselves it will cascade to user_ns1 dropping its own active |
| 144 | * reference and dropping one active reference on user_ns2: |
| 145 | * |
| 146 | * net_ns pid_ns |
| 147 | * \ / |
| 148 | * - - |
| 149 | * user_ns1 (0) |
| 150 | * | |
| 151 | * ipc_ns | uts_ns |
| 152 | * \ | / |
| 153 | * + - + |
| 154 | * user_ns2 (2) |
| 155 | * | |
| 156 | * cgroup_ns | mnt_ns |
| 157 | * \ | / |
| 158 | * x x x |
| 159 | * init_user_ns (1) |
| 160 | * |
| 161 | * The iteration stops once we reach a namespace that still has active |
| 162 | * references. |
| 163 | */ |
| 164 | void __ns_ref_active_put(struct ns_common *ns) |
| 165 | { |
| 166 | /* Initial namespaces are always active. */ |
| 167 | if (is_ns_init_id(ns)) |
| 168 | return; |
| 169 | |
| 170 | if (!atomic_dec_and_test(v: &ns->__ns_ref_active)) { |
| 171 | VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); |
| 172 | return; |
| 173 | } |
| 174 | |
| 175 | VFS_WARN_ON_ONCE(is_ns_init_id(ns)); |
| 176 | VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); |
| 177 | |
| 178 | for (;;) { |
| 179 | ns = ns_owner(ns); |
| 180 | if (!ns) |
| 181 | return; |
| 182 | VFS_WARN_ON_ONCE(is_ns_init_id(ns)); |
| 183 | if (!atomic_dec_and_test(v: &ns->__ns_ref_active)) { |
| 184 | VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); |
| 185 | return; |
| 186 | } |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | /* |
| 191 | * The active reference count works by having each namespace that gets |
| 192 | * created take a single active reference on its owning user namespace. |
| 193 | * That single reference is only released once the child namespace's |
| 194 | * active count itself goes down. This makes it possible to efficiently |
| 195 | * resurrect a namespace tree: |
| 196 | * |
| 197 | * A regular namespace tree might look as follow: |
| 198 | * Legend: |
| 199 | * + : adding active reference |
| 200 | * - : dropping active reference |
| 201 | * x : always active (initial namespace) |
| 202 | * |
| 203 | * |
| 204 | * net_ns pid_ns |
| 205 | * \ / |
| 206 | * + + |
| 207 | * user_ns1 (2) |
| 208 | * | |
| 209 | * ipc_ns | uts_ns |
| 210 | * \ | / |
| 211 | * + + + |
| 212 | * user_ns2 (3) |
| 213 | * | |
| 214 | * cgroup_ns | mnt_ns |
| 215 | * \ | / |
| 216 | * x x x |
| 217 | * init_user_ns (1) |
| 218 | * |
| 219 | * If both net_ns and pid_ns put their last active reference on |
| 220 | * themselves it will cascade to user_ns1 dropping its own active |
| 221 | * reference and dropping one active reference on user_ns2: |
| 222 | * |
| 223 | * net_ns pid_ns |
| 224 | * \ / |
| 225 | * - - |
| 226 | * user_ns1 (0) |
| 227 | * | |
| 228 | * ipc_ns | uts_ns |
| 229 | * \ | / |
| 230 | * + - + |
| 231 | * user_ns2 (2) |
| 232 | * | |
| 233 | * cgroup_ns | mnt_ns |
| 234 | * \ | / |
| 235 | * x x x |
| 236 | * init_user_ns (1) |
| 237 | * |
| 238 | * Assume the whole tree is dead but all namespaces are still active: |
| 239 | * |
| 240 | * net_ns pid_ns |
| 241 | * \ / |
| 242 | * - - |
| 243 | * user_ns1 (0) |
| 244 | * | |
| 245 | * ipc_ns | uts_ns |
| 246 | * \ | / |
| 247 | * - - - |
| 248 | * user_ns2 (0) |
| 249 | * | |
| 250 | * cgroup_ns | mnt_ns |
| 251 | * \ | / |
| 252 | * x x x |
| 253 | * init_user_ns (1) |
| 254 | * |
| 255 | * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): |
| 256 | * |
| 257 | * net_ns pid_ns |
| 258 | * \ / |
| 259 | * + - |
| 260 | * user_ns1 (0) |
| 261 | * | |
| 262 | * ipc_ns | uts_ns |
| 263 | * \ | / |
| 264 | * - + - |
| 265 | * user_ns2 (0) |
| 266 | * | |
| 267 | * cgroup_ns | mnt_ns |
| 268 | * \ | / |
| 269 | * x x x |
| 270 | * init_user_ns (1) |
| 271 | * |
| 272 | * If net_ns had a zero reference count and we bumped it we also need to |
| 273 | * take another reference on its owning user namespace. Similarly, if |
| 274 | * pid_ns had a zero reference count it also needs to take another |
| 275 | * reference on its owning user namespace. So both net_ns and pid_ns |
| 276 | * will each have their own reference on the owning user namespace. |
| 277 | * |
| 278 | * If the owning user namespace user_ns1 had a zero reference count then |
| 279 | * it also needs to take another reference on its owning user namespace |
| 280 | * and so on. |
| 281 | */ |
| 282 | void __ns_ref_active_get(struct ns_common *ns) |
| 283 | { |
| 284 | int prev; |
| 285 | |
| 286 | /* Initial namespaces are always active. */ |
| 287 | if (is_ns_init_id(ns)) |
| 288 | return; |
| 289 | |
| 290 | /* If we didn't resurrect the namespace we're done. */ |
| 291 | prev = atomic_fetch_add(i: 1, v: &ns->__ns_ref_active); |
| 292 | VFS_WARN_ON_ONCE(prev < 0); |
| 293 | if (likely(prev)) |
| 294 | return; |
| 295 | |
| 296 | /* |
| 297 | * We did resurrect it. Walk the ownership hierarchy upwards |
| 298 | * until we found an owning user namespace that is active. |
| 299 | */ |
| 300 | for (;;) { |
| 301 | ns = ns_owner(ns); |
| 302 | if (!ns) |
| 303 | return; |
| 304 | |
| 305 | VFS_WARN_ON_ONCE(is_ns_init_id(ns)); |
| 306 | prev = atomic_fetch_add(i: 1, v: &ns->__ns_ref_active); |
| 307 | VFS_WARN_ON_ONCE(prev < 0); |
| 308 | if (likely(prev)) |
| 309 | return; |
| 310 | } |
| 311 | } |
| 312 | |