--- zzzz-none-000/linux-3.10.107/kernel/sys.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/kernel/sys.c 2021-02-04 17:41:59.000000000 +0000 @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -63,28 +62,28 @@ #include #ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) +# define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) +# define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) +# define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) +# define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) +# define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) +# define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) +# define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) +# define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) @@ -92,6 +91,18 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif +#ifndef MPX_ENABLE_MANAGEMENT +# define MPX_ENABLE_MANAGEMENT() (-EINVAL) +#endif +#ifndef MPX_DISABLE_MANAGEMENT +# define MPX_DISABLE_MANAGEMENT() (-EINVAL) +#endif +#ifndef GET_FP_MODE +# define GET_FP_MODE(a) (-EINVAL) +#endif +#ifndef SET_FP_MODE +# define SET_FP_MODE(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -116,20 +127,6 @@ EXPORT_SYMBOL(fs_overflowgid); /* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - -/* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. * @@ -189,47 +186,48 @@ /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; + if (niceval < MIN_NICE) + niceval = MIN_NICE; + if (niceval > MAX_NICE) + niceval = MAX_NICE; rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) - error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* For find_user() */ - break; + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) + error = set_one_prio(p, niceval, error); + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* For find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -259,47 +257,48 @@ rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { - niceval = 20 - task_nice(p); + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) + goto out_unlock; /* No processes for this user */ + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* for find_user() */ - break; + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* for find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -308,266 +307,6 @@ return retval; } -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - kmsg_dump(KMSG_DUMP_EMERG); - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - usermodehelper_disable(); - device_shutdown(); -} - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. - */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -/* Add backwards compatibility for stable trees. */ -#ifndef PF_NO_SETAFFINITY -#define PF_NO_SETAFFINITY PF_THREAD_BOUND -#endif - -static void migrate_to_reboot_cpu(void) -{ - /* The boot cpu is always logical cpu 0 */ - int cpu = 0; - - cpu_hotplug_disable(); - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(cpu)) - cpu = cpumask_first(cpu_online_mask); - - /* Prevent races with other tasks migrating this task */ - current->flags |= PF_NO_SETAFFINITY; - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. - */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - migrate_to_reboot_cpu(); - syscore_shutdown(); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - usermodehelper_disable(); - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); - kmsg_dump(KMSG_DUMP_HALT); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. - */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(current); - char buffer[256]; - int ret = 0; - - /* We only trust the superuser with rebooting the system. */ - if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* - * If pid namespaces are enabled and the current task is in a child - * pid_namespace, the command is handled by reboot_pid_ns() which will - * call do_exit(). - */ - ret = reboot_pid_ns(pid_ns, cmd); - if (ret) - return ret; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - mutex_lock(&reboot_mutex); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - do_exit(0); - panic("cannot halt"); - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - ret = -EFAULT; - break; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - ret = kernel_kexec(); - break; -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - ret = hibernate(); - break; -#endif - - default: - ret = -EINVAL; - break; - } - mutex_unlock(&reboot_mutex); - return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ - kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -581,11 +320,12 @@ * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ +#ifdef CONFIG_MULTIUSER SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { struct user_namespace *ns = current_user_ns(); @@ -611,7 +351,7 @@ if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || - nsown_capable(CAP_SETGID)) + ns_capable(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; @@ -620,7 +360,7 @@ if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || - nsown_capable(CAP_SETGID)) + ns_capable(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; @@ -639,7 +379,7 @@ } /* - * setgid() is implemented like SysV w/ SAVED_IDS + * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ @@ -661,7 +401,7 @@ old = current_cred(); retval = -EPERM; - if (nsown_capable(CAP_SETGID)) + if (ns_capable(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; @@ -717,7 +457,7 @@ * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. */ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { @@ -745,7 +485,7 @@ new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && - !nsown_capable(CAP_SETUID)) + !ns_capable(old->user_ns, CAP_SETUID)) goto error; } @@ -754,7 +494,7 @@ if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && - !nsown_capable(CAP_SETUID)) + !ns_capable(old->user_ns, CAP_SETUID)) goto error; } @@ -778,17 +518,17 @@ abort_creds(new); return retval; } - + /* - * setuid() is implemented like SysV with SAVED_IDS - * + * setuid() is implemented like SysV with SAVED_IDS + * * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal + * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. + * regain them by swapping the real and effective uid. */ SYSCALL_DEFINE1(setuid, uid_t, uid) { @@ -808,7 +548,7 @@ old = current_cred(); retval = -EPERM; - if (nsown_capable(CAP_SETUID)) { + if (ns_capable(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); @@ -865,7 +605,7 @@ old = current_cred(); retval = -EPERM; - if (!nsown_capable(CAP_SETUID)) { + if (!ns_capable(old->user_ns, CAP_SETUID)) { if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; @@ -912,10 +652,12 @@ euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); - if (!(retval = put_user(ruid, ruidp)) && - !(retval = put_user(euid, euidp))) - retval = put_user(suid, suidp); - + retval = put_user(ruid, ruidp); + if (!retval) { + retval = put_user(euid, euidp); + if (!retval) + return put_user(suid, suidp); + } return retval; } @@ -947,7 +689,7 @@ old = current_cred(); retval = -EPERM; - if (!nsown_capable(CAP_SETGID)) { + if (!ns_capable(old->user_ns, CAP_SETGID)) { if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; @@ -984,9 +726,12 @@ egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(rgid, rgidp)) && - !(retval = put_user(egid, egidp))) - retval = put_user(sgid, sgidp); + retval = put_user(rgid, rgidp); + if (!retval) { + retval = put_user(egid, egidp); + if (!retval) + retval = put_user(sgid, sgidp); + } return retval; } @@ -1018,7 +763,7 @@ if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || - nsown_capable(CAP_SETUID)) { + ns_capable(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -1057,7 +802,7 @@ if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || - nsown_capable(CAP_SETGID)) { + ns_capable(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; goto change_okay; @@ -1071,6 +816,7 @@ commit_creds(new); return old_fsgid; } +#endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process @@ -1137,11 +883,9 @@ { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); @@ -1170,8 +914,7 @@ * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 + * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { @@ -1207,7 +950,7 @@ if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; - if (p->did_exec) + if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; @@ -1309,6 +1052,17 @@ return retval; } +static void set_special_pids(struct pid *pid) +{ + struct task_struct *curr = current->group_leader; + + if (task_session(curr) != pid) + change_pid(curr, PIDTYPE_SID, pid); + + if (task_pgrp(curr) != pid) + change_pid(curr, PIDTYPE_PGID, pid); +} + SYSCALL_DEFINE0(setsid) { struct task_struct *group_leader = current->group_leader; @@ -1328,7 +1082,7 @@ goto out; group_leader->signal->leader = 1; - __set_special_pids(sid); + set_special_pids(sid); proc_clear_tty(group_leader); @@ -1356,6 +1110,7 @@ /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. */ static int override_release(char __user *release, size_t len) { @@ -1375,7 +1130,7 @@ break; rest++; } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); @@ -1549,7 +1304,6 @@ /* * Back compatibility for getrlimit. Needed for some apps. */ - SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { @@ -1564,7 +1318,7 @@ x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; + return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } #endif @@ -1792,7 +1546,7 @@ cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; - memset((char *) r, 0, sizeof *r); + memset((char *)r, 0, sizeof (*r)); utime = stime = 0; if (who == RUSAGE_THREAD) { @@ -1806,42 +1560,41 @@ return; switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - maxrss = p->signal->cmaxrss; - - if (who == RUSAGE_CHILDREN) - break; + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; - case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; - t = p; - do { - accumulate_thread_rusage(t, r); - t = next_thread(t); - } while (t != p); + if (who == RUSAGE_CHILDREN) break; - default: - BUG(); + case RUSAGE_SELF: + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; + t = p; + do { + accumulate_thread_rusage(t, r); + } while_each_thread(p, t); + break; + + default: + BUG(); } unlock_task_sighand(p, &flags); @@ -1851,6 +1604,7 @@ if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); + if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); @@ -1862,6 +1616,7 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; + k_getrusage(p, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } @@ -1897,6 +1652,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; + struct file *old_exe, *exe_file; struct inode *inode; int err; @@ -1912,28 +1668,32 @@ * overall picture. */ err = -EACCES; - if (!S_ISREG(inode->i_mode) || - exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) goto exit; err = inode_permission(inode, MAY_EXEC); if (err) goto exit; - down_write(&mm->mmap_sem); - /* * Forbid mm->exe_file change if old file still mapped. */ + exe_file = get_mm_exe_file(mm); err = -EBUSY; - if (mm->exe_file) { + if (exe_file) { struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_file && - path_equal(&vma->vm_file->f_path, - &mm->exe_file->f_path)) - goto exit_unlock; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_err; + } + + up_read(&mm->mmap_sem); + fput(exe_file); } /* @@ -1944,81 +1704,326 @@ */ err = -EPERM; if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit_unlock; + goto exit; err = 0; - set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ -exit_unlock: - up_write(&mm->mmap_sem); - + /* set the new file, lockless */ + get_file(exe.file); + old_exe = xchg(&mm->exe_file, exe.file); + if (old_exe) + fput(old_exe); exit: fdput(exe); return err; +exit_err: + up_read(&mm->mmap_sem); + fput(exe_file); + goto exit; +} + +/* + * WARNING: we don't require any capability here so be very careful + * in what is allowed for modification from userspace. + */ +static int validate_prctl_map(struct prctl_mm_map *prctl_map) +{ + unsigned long mmap_max_addr = TASK_SIZE; + struct mm_struct *mm = current->mm; + int error = -EINVAL, i; + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), + offsetof(struct prctl_mm_map, end_code), + offsetof(struct prctl_mm_map, start_data), + offsetof(struct prctl_mm_map, end_data), + offsetof(struct prctl_mm_map, start_brk), + offsetof(struct prctl_mm_map, brk), + offsetof(struct prctl_mm_map, start_stack), + offsetof(struct prctl_mm_map, arg_start), + offsetof(struct prctl_mm_map, arg_end), + offsetof(struct prctl_mm_map, env_start), + offsetof(struct prctl_mm_map, env_end), + }; + + /* + * Make sure the members are not somewhere outside + * of allowed address space. + */ + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; + } + + /* + * Make sure the pairs are ordered. + */ +#define __prctl_check_order(__m1, __op, __m2) \ + ((unsigned long)prctl_map->__m1 __op \ + (unsigned long)prctl_map->__m2) ? 0 : -EINVAL + error = __prctl_check_order(start_code, <, end_code); + error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_brk, <=, brk); + error |= __prctl_check_order(arg_start, <=, arg_end); + error |= __prctl_check_order(env_start, <=, env_end); + if (error) + goto out; +#undef __prctl_check_order + + error = -EINVAL; + + /* + * @brk should be after @end_data in traditional maps. + */ + if (prctl_map->start_brk <= prctl_map->end_data || + prctl_map->brk <= prctl_map->end_data) + goto out; + + /* + * Neither we should allow to override limits if they set. + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, + prctl_map->start_brk, prctl_map->end_data, + prctl_map->start_data)) + goto out; + + /* + * Someone is trying to cheat the auxv vector. + */ + if (prctl_map->auxv_size) { + if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) + goto out; + } + + /* + * Finally, make sure the caller has the rights to + * change /proc/pid/exe link: only local root should + * be allowed to. + */ + if (prctl_map->exe_fd != (u32)-1) { + struct user_namespace *ns = current_user_ns(); + const struct cred *cred = current_cred(); + + if (!uid_eq(cred->uid, make_kuid(ns, 0)) || + !gid_eq(cred->gid, make_kgid(ns, 0))) + goto out; + } + + error = 0; +out: + return error; +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) +{ + struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; + unsigned long user_auxv[AT_VECTOR_SIZE]; + struct mm_struct *mm = current->mm; + int error; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); + + if (opt == PR_SET_MM_MAP_SIZE) + return put_user((unsigned int)sizeof(prctl_map), + (unsigned int __user *)addr); + + if (data_size != sizeof(prctl_map)) + return -EINVAL; + + if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) + return -EFAULT; + + error = validate_prctl_map(&prctl_map); + if (error) + return error; + + if (prctl_map.auxv_size) { + memset(user_auxv, 0, sizeof(user_auxv)); + if (copy_from_user(user_auxv, + (const void __user *)prctl_map.auxv, + prctl_map.auxv_size)) + return -EFAULT; + + /* Last entry must be AT_NULL as specification requires */ + user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; + user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; + } + + if (prctl_map.exe_fd != (u32)-1) { + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); + if (error) + return error; + } + + down_write(&mm->mmap_sem); + + /* + * We don't validate if these members are pointing to + * real present VMAs because application may have correspond + * VMAs already unmapped and kernel uses these members for statistics + * output in procfs mostly, except + * + * - @start_brk/@brk which are used in do_brk but kernel lookups + * for VMAs when updating these memvers so anything wrong written + * here cause kernel to swear at userspace program but won't lead + * to any problem in kernel itself + */ + + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + + /* + * Note this update of @saved_auxv is lockless thus + * if someone reads this member in procfs while we're + * updating -- it may get partly updated results. It's + * known and acceptable trade off: we leave it as is to + * not introduce additional locks here making the kernel + * more complex. + */ + if (prctl_map.auxv_size) + memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); + + up_write(&mm->mmap_sem); + return 0; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ + +static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + /* + * This doesn't move the auxiliary vector itself since it's pinned to + * mm_struct, but it permits filling the vector with new values. It's + * up to the caller to provide sane values here, otherwise userspace + * tools which use this vector might be unhappy. + */ + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (len > sizeof(user_auxv)) + return -EINVAL; + + if (copy_from_user(user_auxv, (const void __user *)addr, len)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, len); + task_unlock(current); + + return 0; } static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { - unsigned long rlim = rlimit(RLIMIT_DATA); struct mm_struct *mm = current->mm; + struct prctl_mm_map prctl_map; struct vm_area_struct *vma; int error; - if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && + opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) + return prctl_set_mm_map(opt, (const void __user *)addr, arg4); +#endif + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (opt == PR_SET_MM_AUXV) + return prctl_set_auxv(mm, addr, arg4); + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, addr); + prctl_map.start_code = mm->start_code; + prctl_map.end_code = mm->end_code; + prctl_map.start_data = mm->start_data; + prctl_map.end_data = mm->end_data; + prctl_map.start_brk = mm->start_brk; + prctl_map.brk = mm->brk; + prctl_map.start_stack = mm->start_stack; + prctl_map.arg_start = mm->arg_start; + prctl_map.arg_end = mm->arg_end; + prctl_map.env_start = mm->env_start; + prctl_map.env_end = mm->env_end; + prctl_map.auxv = NULL; + prctl_map.auxv_size = 0; + prctl_map.exe_fd = -1; + switch (opt) { case PR_SET_MM_START_CODE: - mm->start_code = addr; + prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE: - mm->end_code = addr; + prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA: - mm->start_data = addr; + prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA: - mm->end_data = addr; + prctl_map.end_data = addr; + break; + case PR_SET_MM_START_STACK: + prctl_map.start_stack = addr; break; - case PR_SET_MM_START_BRK: - if (addr <= mm->end_data) - goto out; - - if (rlim < RLIM_INFINITY && - (mm->brk - addr) + - (mm->end_data - mm->start_data) > rlim) - goto out; - - mm->start_brk = addr; + prctl_map.start_brk = addr; break; - case PR_SET_MM_BRK: - if (addr <= mm->end_data) - goto out; - - if (rlim < RLIM_INFINITY && - (addr - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) - goto out; - - mm->brk = addr; + prctl_map.brk = addr; + break; + case PR_SET_MM_ARG_START: + prctl_map.arg_start = addr; + break; + case PR_SET_MM_ARG_END: + prctl_map.arg_end = addr; + break; + case PR_SET_MM_ENV_START: + prctl_map.env_start = addr; break; + case PR_SET_MM_ENV_END: + prctl_map.env_end = addr; + break; + default: + goto out; + } + error = validate_prctl_map(&prctl_map); + if (error) + goto out; + + switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can @@ -2035,55 +2040,23 @@ error = -EFAULT; goto out; } - if (opt == PR_SET_MM_START_STACK) - mm->start_stack = addr; - else if (opt == PR_SET_MM_ARG_START) - mm->arg_start = addr; - else if (opt == PR_SET_MM_ARG_END) - mm->arg_end = addr; - else if (opt == PR_SET_MM_ENV_START) - mm->env_start = addr; - else if (opt == PR_SET_MM_ENV_END) - mm->env_end = addr; - break; - - /* - * This doesn't move auxiliary vector itself - * since it's pinned to mm_struct, but allow - * to fill vector with new values. It's up - * to a caller to provide sane values here - * otherwise user space tools which use this - * vector might be unhappy. - */ - case PR_SET_MM_AUXV: { - unsigned long user_auxv[AT_VECTOR_SIZE]; - - if (arg4 > sizeof(user_auxv)) - goto out; - up_read(&mm->mmap_sem); - - if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) - return -EFAULT; - - /* Make sure the last entry is always AT_NULL */ - user_auxv[AT_VECTOR_SIZE - 2] = 0; - user_auxv[AT_VECTOR_SIZE - 1] = 0; - - BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); - - task_lock(current); - memcpy(mm->saved_auxv, user_auxv, arg4); - task_unlock(current); - - return 0; - } - default: - goto out; } + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + error = 0; out: - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); return error; } @@ -2256,12 +2229,43 @@ if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - current->no_new_privs = 1; + task_set_no_new_privs(current); break; case PR_GET_NO_NEW_PRIVS: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - return current->no_new_privs ? 1 : 0; + return task_no_new_privs(current) ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; + case PR_MPX_ENABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = MPX_ENABLE_MANAGEMENT(); + break; + case PR_MPX_DISABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = MPX_DISABLE_MANAGEMENT(); + break; + case PR_SET_FP_MODE: + error = SET_FP_MODE(me, arg2); + break; + case PR_GET_FP_MODE: + error = GET_FP_MODE(me); + break; default: error = -EINVAL; break; @@ -2274,6 +2278,7 @@ { int err = 0; int cpu = raw_smp_processor_id(); + if (cpup) err |= put_user(cpu, cpup); if (nodep) @@ -2281,68 +2286,6 @@ return err ? -EFAULT : 0; } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static int __orderly_poweroff(bool force) -{ - char **argv; - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); - if (argv) { - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - argv_free(argv); - } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - ret = -ENOMEM; - } - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} - -static bool poweroff_force; - -static void poweroff_work_func(struct work_struct *work) -{ - __orderly_poweroff(poweroff_force); -} - -static DECLARE_WORK(poweroff_work, poweroff_work_func); - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) -{ - if (force) /* do not override the pending "true" */ - poweroff_force = true; - schedule_work(&poweroff_work); - return 0; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill @@ -2355,8 +2298,7 @@ memset(info, 0, sizeof(struct sysinfo)); - ktime_get_ts(&tp); - monotonic_to_bootbased(&tp); + get_monotonic_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); @@ -2449,7 +2391,7 @@ /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { + if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) {