// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. It is typically used for
 * implementing per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of the jump target abort_ip must be outside the
 *   critical region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests whether the current cpu_id field matches the
 *       cpu number loaded before start_ip, branching to abort_ip in
 *       case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->rseq::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  The final instruction of the userspace critical section before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
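
/*
 * Illustrative sketch of the above (an assumption about typical
 * user-space usage, not something defined in this file): a per-cpu
 * counter increment. The __rseq_abi TLS area and the RSEQ_SIG
 * signature are provided by the user-space environment (e.g. glibc or
 * librseq), not by the kernel.
 *
 *	struct rseq_cs desc = {
 *		.start_ip		= (__u64)start_ip,
 *		.post_commit_offset	= (__u64)(post_commit_ip - start_ip),
 *		.abort_ip		= (__u64)abort_ip,
 *	};
 *
 *	cpu = READ_ONCE(__rseq_abi.cpu_id_start);
 *	WRITE_ONCE(__rseq_abi.rseq_cs, (__u64)&desc);		[1]
 * start_ip:
 *	if (cpu != READ_ONCE(__rseq_abi.cpu_id))		[2]
 *		goto abort_ip;
 *	counters[cpu]++;					[3] commit
 * post_commit_ip:
 *	return;
 *
 *	.long RSEQ_SIG		signature checked by the kernel on abort
 * abort_ip:
 *	goto retry;
 *
 * A real implementation has to express steps [1]-[3] as a fixed inline
 * assembly sequence so that start_ip, post_commit_ip and abort_ip are
 * well-defined addresses, which plain C cannot guarantee.
 */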

/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

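/*
 * Usage note (a sketch of the expected spelling, not extra behaviour):
 * the debug checks can be enabled at boot time via the "rseq_debug="
 * command line parameter, e.g. "rseq_debug=on" or "rseq_debug=1",
 * using the usual kstrtobool() spellings (0/1, y/n, on/off).
 */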
static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line so that the actual update functions can live in a header
 * and be inlined into the exit-to-user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath	+= data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
	}

	seq_printf(m, "exit:   %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
	seq_printf(m, "fastp:  %16lu\n", stats.fastpath);
	seq_printf(m, "ids:    %16lu\n", stats.ids);
	seq_printf(m, "cs:     %16lu\n", stats.cs);
	seq_printf(m, "clear:  %16lu\n", stats.clear);
	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open		= rseq_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static void __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
				size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open		= rseq_debug_open,
	.read		= seq_read,
	.write		= rseq_debug_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

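/*
 * Usage sketch (assuming debugfs is mounted at /sys/kernel/debug):
 * the directory created below exposes /sys/kernel/debug/rseq/debug as
 * a writable boolean toggling the debug checks at runtime and, with
 * CONFIG_RSEQ_STATS, /sys/kernel/debug/rseq/stats with the counters
 * accumulated above, e.g.:
 *
 *	echo 1 > /sys/kernel/debug/rseq/debug
 *	cat /sys/kernel/debug/rseq/stats
 */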
static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);
#endif /* CONFIG_DEBUG_FS */

static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve rseq state and user_irq state. The generic entry code
	 * clears user_irq on the way out; architectures which do not use
	 * the generic entry code do not have user_irq.
	 */
	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
	struct task_struct *t = current;
	struct rseq_ids ids;
	u32 node_id;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task was
	 * neither preempted nor migrated and no signal is on the way,
	 * there is no point in doing any of the heavy lifting here on
	 * production kernels. In that case TIF_NOTIFY_RESUME was raised
	 * by some other functionality.
	 *
	 * This is correct because the read/clear operation is guarded
	 * against scheduler preemption, which makes it CPU local atomic.
	 * If the task is preempted right after re-enabling preemption
	 * then TIF_NOTIFY_RESUME is set again and this function is
	 * invoked another time _before_ the task is able to return to
	 * user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally with
	 * the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it is safe to ignore this case here instead of pointlessly
	 * updating the user-space rseq area in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);
	/*
	 * Don't update IDs; they are handled on exit to user mode if
	 * necessary. The important thing is to abort a critical section
	 * of the interrupted context, as after this point the
	 * instruction pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around so that GENERIC_ENTRY=n architectures remain supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq_ids ids = {
		.cpu_id	= RSEQ_CPU_ID_UNINITIALIZED,
		.mm_cid	= 0,
	};

	/*
	 * If this fails, terminate the task, as a failure leaves the
	 * kernel in an inconsistent state: the exit to user space path
	 * would try to fix up the ids again.
	 */
	if (rseq_set_ids(current, &ids, 0))
		return true;

	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE	32
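
/*
 * Illustrative registration sketch (an assumption about a typical
 * caller, not kernel code; modern glibc registers its own area at
 * thread start and exposes it via __rseq_offset/__rseq_size).
 * RSEQ_SIG is an application-chosen signature which must also precede
 * every abort handler used in critical sections:
 *
 *	static __thread struct rseq rs __attribute__((aligned(32)));
 *
 *	// register the area for the calling thread
 *	syscall(__NR_rseq, &rs, sizeof(rs), 0, RSEQ_SIG);
 *	// ... rs.cpu_id / rs.mm_cid are now kept up to date ...
 *	// unregister again, e.g. before the TLS area is freed
 *	syscall(__NR_rseq, &rs, sizeof(rs), RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
 */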

/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
			return -EINVAL;
		if (rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		if (!rseq_reset_ids())
			return -EFAULT;
		rseq_reset(current);
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq.usrptr) {
		/*
		 * If rseq is already registered, check whether the
		 * provided address differs from the prior one.
		 */
		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it
		 * to avoid a potential segfault on return to user-space. The
		 * proper thing to do would have been to fail the registration,
		 * but this would break older libcs that reuse the rseq area
		 * for new threads without clearing the fields. Don't bother
		 * reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

	/*
	 * If rseq was previously inactive, and has just been registered,
	 * ensure the cpu_id_start and cpu_id fields are updated before
	 * returning to user-space.
	 */
	current->rseq.event.has_rseq = true;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}
| 479 | |