kernel/sys.c

   1 /*
   2  *  linux/kernel/sys.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 #include <linux/export.h>
   8 #include <linux/mm.h>
   9 #include <linux/utsname.h>
  10 #include <linux/mman.h>
  11 #include <linux/reboot.h>
  12 #include <linux/prctl.h>
  13 #include <linux/highuid.h>
  14 #include <linux/fs.h>
  15 #include <linux/kmod.h>
  16 #include <linux/perf_event.h>
  17 #include <linux/resource.h>
  18 #include <linux/kernel.h>
  19 #include <linux/workqueue.h>
  20 #include <linux/capability.h>
  21 #include <linux/device.h>
  22 #include <linux/key.h>
  23 #include <linux/times.h>
  24 #include <linux/posix-timers.h>
  25 #include <linux/security.h>
  26 #include <linux/dcookies.h>
  27 #include <linux/suspend.h>
  28 #include <linux/tty.h>
  29 #include <linux/signal.h>
  30 #include <linux/cn_proc.h>
  31 #include <linux/getcpu.h>
  32 #include <linux/task_io_accounting_ops.h>
  33 #include <linux/seccomp.h>
  34 #include <linux/cpu.h>
  35 #include <linux/personality.h>
  36 #include <linux/ptrace.h>
  37 #include <linux/fs_struct.h>
  38 #include <linux/file.h>
  39 #include <linux/mount.h>
  40 #include <linux/gfp.h>
  41 #include <linux/syscore_ops.h>
  42 #include <linux/version.h>
  43 #include <linux/ctype.h>
  44
  45 #include <linux/compat.h>
  46 #include <linux/syscalls.h>
  47 #include <linux/kprobes.h>
  48 #include <linux/user_namespace.h>
  49 #include <linux/binfmts.h>
  50
  51 #include <linux/sched.h>
  52 #include <linux/rcupdate.h>
  53 #include <linux/uidgid.h>
  54 #include <linux/cred.h>
  55
  56 #include <linux/kmsg_dump.h>
  57 /* Move somewhere else to avoid recompiling? */
  58 #include <generated/utsrelease.h>
  59
  60 #include <asm/uaccess.h>
  61 #include <asm/io.h>
  62 #include <asm/unistd.h>
  63
/*
 * Fallback definitions for optional, architecture-provided control hooks.
 *
 * Each macro below is defined here only when nothing included earlier has
 * already defined it, and in that case it simply evaluates to -EINVAL.
 * NOTE(review): the users of these macros are not in this part of the
 * file; presumably they are the prctl() handlers (<linux/prctl.h> is
 * included above) -- confirm against the rest of the file.
 */
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a, b)  (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
# define GET_UNALIGN_CTL(a, b)  (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
# define SET_FPEMU_CTL(a, b)    (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
# define GET_FPEMU_CTL(a, b)    (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
# define SET_FPEXC_CTL(a, b)    (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
# define GET_FPEXC_CTL(a, b)    (-EINVAL)
#endif
#ifndef GET_ENDIAN
# define GET_ENDIAN(a, b)       (-EINVAL)
#endif
#ifndef SET_ENDIAN
# define SET_ENDIAN(a, b)       (-EINVAL)
#endif
/* The TSC and MPX hooks take a single argument. */
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a)         (-EINVAL)
#endif
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a)         (-EINVAL)
#endif
#ifndef MPX_ENABLE_MANAGEMENT
# define MPX_ENABLE_MANAGEMENT(a)       (-EINVAL)
#endif
#ifndef MPX_DISABLE_MANAGEMENT
# define MPX_DISABLE_MANAGEMENT(a)      (-EINVAL)
#endif
 100
 101 /*
 102  * this is where the system-wide overflow UID and GID are defined, for
 103  * architectures that now have 32-bit UID/GID but didn't in the past
 104  */
 105
 106 int overflowuid = DEFAULT_OVERFLOWUID;
 107 int overflowgid = DEFAULT_OVERFLOWGID;
 108
 109 EXPORT_SYMBOL(overflowuid);
 110 EXPORT_SYMBOL(overflowgid);
 111
 112 /*
 113  * the same as above, but for filesystems which can only store a 16-bit
 114  * UID and GID. as such, this is needed on all architectures
 115  */
 116
 117 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
 118 int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
 119
 120 EXPORT_SYMBOL(fs_overflowuid);
 121 EXPORT_SYMBOL(fs_overflowgid);
 122
 123 /*
 124  * Returns true if current's euid is same as p's uid or euid,
 125  * or has CAP_SYS_NICE to p's user_ns.
 126  *
 127  * Called with rcu_read_lock, creds are safe
 128  */
 129 static bool set_one_prio_perm(struct task_struct *p)
 130 {
 131         const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 132
 133         if (uid_eq(pcred->uid,  cred->euid) ||
 134             uid_eq(pcred->euid, cred->euid))
 135                 return true;
 136         if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
 137                 return true;
 138         return false;
 139 }
 140
 141 /*
 142  * set the priority of a task
 143  * - the caller must hold the RCU read lock
 144  */
 145 static int set_one_prio(struct task_struct *p, int niceval, int error)
 146 {
 147         int no_nice;
 148
 149         if (!set_one_prio_perm(p)) {
 150                 error = -EPERM;
 151                 goto out;
 152         }
 153         if (niceval < task_nice(p) && !can_nice(p, niceval)) {
 154                 error = -EACCES;
 155                 goto out;
 156         }
 157         no_nice = security_task_setnice(p, niceval);
 158         if (no_nice) {
 159                 error = no_nice;
 160                 goto out;
 161         }
 162         if (error == -ESRCH)
 163                 error = 0;
 164         set_user_nice(p, niceval);
 165 out:
 166         return error;
 167 }
 168
 169 SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 170 {
 171         struct task_struct *g, *p;
 172         struct user_struct *user;
 173         const struct cred *cred = current_cred();
 174         int error = -EINVAL;
 175         struct pid *pgrp;
 176         kuid_t uid;
 177
 178         if (which > PRIO_USER || which < PRIO_PROCESS)
 179                 goto out;
 180
 181         /* normalize: avoid signed division (rounding problems) */
 182         error = -ESRCH;
 183         if (niceval < MIN_NICE)
 184                 niceval = MIN_NICE;
 185         if (niceval > MAX_NICE)
 186                 niceval = MAX_NICE;
 187
 188         rcu_read_lock();
 189         read_lock(&tasklist_lock);
 190         switch (which) {
 191         case PRIO_PROCESS:
 192                 if (who)
 193                         p = find_task_by_vpid(who);
 194                 else
 195                         p = current;
 196                 if (p)
 197                         error = set_one_prio(p, niceval, error);
 198                 break;
 199         case PRIO_PGRP:
 200                 if (who)
 201                         pgrp = find_vpid(who);
 202                 else
 203                         pgrp = task_pgrp(current);
 204                 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
 205                         error = set_one_prio(p, niceval, error);
 206                 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 207                 break;
 208         case PRIO_USER:
 209                 uid = make_kuid(cred->user_ns, who);
 210                 user = cred->user;
 211                 if (!who)
 212                         uid = cred->uid;
 213                 else if (!uid_eq(uid, cred->uid)) {
 214                         user = find_user(uid);
 215                         if (!user)
 216                                 goto out_unlock;        /* No processes for this user */
 217                 }
 218                 do_each_thread(g, p) {
 219                         if (uid_eq(task_uid(p), uid))
 220                                 error = set_one_prio(p, niceval, error);
 221                 } while_each_thread(g, p);
 222                 if (!uid_eq(uid, cred->uid))
 223                         free_uid(user);         /* For find_user() */
 224                 break;
 225         }
 226 out_unlock:
 227         read_unlock(&tasklist_lock);
 228         rcu_read_unlock();
 229 out:
 230         return error;
 231 }
 232
 233 /*
 234  * Ugh. To avoid negative return values, "getpriority()" will
 235  * not return the normal nice-value, but a negated value that
 236  * has been offset by 20 (ie it returns 40..1 instead of -20..19)
 237  * to stay compatible.
 238  */
 239 SYSCALL_DEFINE2(getpriority, int, which, int, who)
 240 {
 241         struct task_struct *g, *p;
 242         struct user_struct *user;
 243         const struct cred *cred = current_cred();
 244         long niceval, retval = -ESRCH;
 245         struct pid *pgrp;
 246         kuid_t uid;
 247
 248         if (which > PRIO_USER || which < PRIO_PROCESS)
 249                 return -EINVAL;
 250
 251         rcu_read_lock();
 252         read_lock(&tasklist_lock);
 253         switch (which) {
 254         case PRIO_PROCESS:
 255                 if (who)
 256                         p = find_task_by_vpid(who);
 257                 else
 258                         p = current;
 259                 if (p) {
 260                         niceval = nice_to_rlimit(task_nice(p));
 261                         if (niceval > retval)
 262                                 retval = niceval;
 263                 }
 264                 break;
 265         case PRIO_PGRP:
 266                 if (who)
 267                         pgrp = find_vpid(who);
 268                 else
 269                         pgrp = task_pgrp(current);
 270                 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
 271                         niceval = nice_to_rlimit(task_nice(p));
 272                         if (niceval > retval)
 273                                 retval = niceval;
 274                 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 275                 break;
 276         case PRIO_USER:
 277                 uid = make_kuid(cred->user_ns, who);
 278                 user = cred->user;
 279                 if (!who)
 280                         uid = cred->uid;
 281                 else if (!uid_eq(uid, cred->uid)) {
 282                         user = find_user(uid);
 283                         if (!user)
 284                                 goto out_unlock;        /* No processes for this user */
 285                 }
 286                 do_each_thread(g, p) {
 287                         if (uid_eq(task_uid(p), uid)) {
 288                                 niceval = nice_to_rlimit(task_nice(p));
 289                                 if (niceval > retval)
 290                                         retval = niceval;
 291                         }
 292                 } while_each_thread(g, p);
 293                 if (!uid_eq(uid, cred->uid))
 294                         free_uid(user);         /* for find_user() */
 295                 break;
 296         }
 297 out_unlock:
 298         read_unlock(&tasklist_lock);
 299         rcu_read_unlock();
 300
 301         return retval;
 302 }
 303
 304 /*
 305  * Unprivileged users may change the real gid to the effective gid
 306  * or vice versa.  (BSD-style)
 307  *
 308  * If you set the real gid at all, or set the effective gid to a value not
 309  * equal to the real gid, then the saved gid is set to the new effective gid.
 310  *
 311  * This makes it possible for a setgid program to completely drop its
 312  * privileges, which is often a useful assertion to make when you are doing
 313  * a security audit over a program.
 314  *
 315  * The general idea is that a program which uses just setregid() will be
 316  * 100% compatible with BSD.  A program which uses just setgid() will be
 317  * 100% compatible with POSIX with saved IDs.
 318  *
 319  * SMP: There are not races, the GIDs are checked only by filesystem
 320  *      operations (as far as semantic preservation is concerned).
 321  */
 322 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 323 {
 324         struct user_namespace *ns = current_user_ns();
 325         const struct cred *old;
 326         struct cred *new;
 327         int retval;
 328         kgid_t krgid, kegid;
 329
 330         krgid = make_kgid(ns, rgid);
 331         kegid = make_kgid(ns, egid);
 332
 333         if ((rgid != (gid_t) -1) && !gid_valid(krgid))
 334                 return -EINVAL;
 335         if ((egid != (gid_t) -1) && !gid_valid(kegid))
 336                 return -EINVAL;
 337
 338         new = prepare_creds();
 339         if (!new)
 340                 return -ENOMEM;
 341         old = current_cred();
 342
 343         retval = -EPERM;
 344         if (rgid != (gid_t) -1) {
 345                 if (gid_eq(old->gid, krgid) ||
 346                     gid_eq(old->egid, krgid) ||
 347                     ns_capable(old->user_ns, CAP_SETGID))
 348                         new->gid = krgid;
 349                 else
 350                         goto error;
 351         }
 352         if (egid != (gid_t) -1) {
 353                 if (gid_eq(old->gid, kegid) ||
 354                     gid_eq(old->egid, kegid) ||
 355                     gid_eq(old->sgid, kegid) ||
 356                     ns_capable(old->user_ns, CAP_SETGID))
 357                         new->egid = kegid;
 358                 else
 359                         goto error;
 360         }
 361
 362         if (rgid != (gid_t) -1 ||
 363             (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
 364                 new->sgid = new->egid;
 365         new->fsgid = new->egid;
 366
 367         return commit_creds(new);
 368
 369 error:
 370         abort_creds(new);
 371         return retval;
 372 }
 373
 374 /*
 375  * setgid() is implemented like SysV w/ SAVED_IDS
 376  *
 377  * SMP: Same implicit races as above.
 378  */
 379 SYSCALL_DEFINE1(setgid, gid_t, gid)
 380 {
 381         struct user_namespace *ns = current_user_ns();
 382         const struct cred *old;
 383         struct cred *new;
 384         int retval;
 385         kgid_t kgid;
 386
 387         kgid = make_kgid(ns, gid);
 388         if (!gid_valid(kgid))
 389                 return -EINVAL;
 390
 391         new = prepare_creds();
 392         if (!new)
 393                 return -ENOMEM;
 394         old = current_cred();
 395
 396         retval = -EPERM;
 397         if (ns_capable(old->user_ns, CAP_SETGID))
 398                 new->gid = new->egid = new->sgid = new->fsgid = kgid;
 399         else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
 400                 new->egid = new->fsgid = kgid;
 401         else
 402                 goto error;
 403
 404         return commit_creds(new);
 405
 406 error:
 407         abort_creds(new);
 408         return retval;
 409 }
 410
 411 /*
 412  * change the user struct in a credentials set to match the new UID
 413  */
 414 static int set_user(struct cred *new)
 415 {
 416         struct user_struct *new_user;
 417
 418         new_user = alloc_uid(new->uid);
 419         if (!new_user)
 420                 return -EAGAIN;
 421
 422         /*
 423          * We don't fail in case of NPROC limit excess here because too many
 424          * poorly written programs don't check set*uid() return code, assuming
 425          * it never fails if called by root.  We may still enforce NPROC limit
 426          * for programs doing set*uid()+execve() by harmlessly deferring the
 427          * failure to the execve() stage.
 428          */
 429         if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
 430                         new_user != INIT_USER)
 431                 current->flags |= PF_NPROC_EXCEEDED;
 432         else
 433                 current->flags &= ~PF_NPROC_EXCEEDED;
 434
 435         free_uid(new->user);
 436         new->user = new_user;
 437         return 0;
 438 }
 439
 440 /*
 441  * Unprivileged users may change the real uid to the effective uid
 442  * or vice versa.  (BSD-style)
 443  *
 444  * If you set the real uid at all, or set the effective uid to a value not
 445  * equal to the real uid, then the saved uid is set to the new effective uid.
 446  *
 447  * This makes it possible for a setuid program to completely drop its
 448  * privileges, which is often a useful assertion to make when you are doing
 449  * a security audit over a program.
 450  *
 451  * The general idea is that a program which uses just setreuid() will be
 452  * 100% compatible with BSD.  A program which uses just setuid() will be
 453  * 100% compatible with POSIX with saved IDs.
 454  */
 455 SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 456 {
 457         struct user_namespace *ns = current_user_ns();
 458         const struct cred *old;
 459         struct cred *new;
 460         int retval;
 461         kuid_t kruid, keuid;
 462
 463         kruid = make_kuid(ns, ruid);
 464         keuid = make_kuid(ns, euid);
 465
 466         if ((ruid != (uid_t) -1) && !uid_valid(kruid))
 467                 return -EINVAL;
 468         if ((euid != (uid_t) -1) && !uid_valid(keuid))
 469                 return -EINVAL;
 470
 471         new = prepare_creds();
 472         if (!new)
 473                 return -ENOMEM;
 474         old = current_cred();
 475
 476         retval = -EPERM;
 477         if (ruid != (uid_t) -1) {
 478                 new->uid = kruid;
 479                 if (!uid_eq(old->uid, kruid) &&
 480                     !uid_eq(old->euid, kruid) &&
 481                     !ns_capable(old->user_ns, CAP_SETUID))
 482                         goto error;
 483         }
 484
 485         if (euid != (uid_t) -1) {
 486                 new->euid = keuid;
 487                 if (!uid_eq(old->uid, keuid) &&
 488                     !uid_eq(old->euid, keuid) &&
 489                     !uid_eq(old->suid, keuid) &&
 490                     !ns_capable(old->user_ns, CAP_SETUID))
 491                         goto error;
 492         }
 493
 494         if (!uid_eq(new->uid, old->uid)) {
 495                 retval = set_user(new);
 496                 if (retval < 0)
 497                         goto error;
 498         }
 499         if (ruid != (uid_t) -1 ||
 500             (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
 501                 new->suid = new->euid;
 502         new->fsuid = new->euid;
 503
 504         retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
 505         if (retval < 0)
 506                 goto error;
 507
 508         return commit_creds(new);
 509
 510 error:
 511         abort_creds(new);
 512         return retval;
 513 }
 514
 515 /*
 516  * setuid() is implemented like SysV with SAVED_IDS
 517  *
 518  * Note that SAVED_ID's is deficient in that a setuid root program
 519  * like sendmail, for example, cannot set its uid to be a normal
 520  * user and then switch back, because if you're root, setuid() sets
 521  * the saved uid too.  If you don't like this, blame the bright people
 522  * in the POSIX committee and/or USG.  Note that the BSD-style setreuid()
 523  * will allow a root program to temporarily drop privileges and be able to
 524  * regain them by swapping the real and effective uid.
 525  */
 526 SYSCALL_DEFINE1(setuid, uid_t, uid)
 527 {
 528         struct user_namespace *ns = current_user_ns();
 529         const struct cred *old;
 530         struct cred *new;
 531         int retval;
 532         kuid_t kuid;
 533
 534         kuid = make_kuid(ns, uid);
 535         if (!uid_valid(kuid))
 536                 return -EINVAL;
 537
 538         new = prepare_creds();
 539         if (!new)
 540                 return -ENOMEM;
 541         old = current_cred();
 542
 543         retval = -EPERM;
 544         if (ns_capable(old->user_ns, CAP_SETUID)) {
 545                 new->suid = new->uid = kuid;
 546                 if (!uid_eq(kuid, old->uid)) {
 547                         retval = set_user(new);
 548                         if (retval < 0)
 549                                 goto error;
 550                 }
 551         } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
 552                 goto error;
 553         }
 554
 555         new->fsuid = new->euid = kuid;
 556
 557         retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
 558         if (retval < 0)
 559                 goto error;
 560
 561         return commit_creds(new);
 562
 563 error:
 564         abort_creds(new);
 565         return retval;
 566 }
 567
 568
 569 /*
 570  * This function implements a generic ability to update ruid, euid,
 571  * and suid.  This allows you to implement the 4.4 compatible seteuid().
 572  */
 573 SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 574 {
 575         struct user_namespace *ns = current_user_ns();
 576         const struct cred *old;
 577         struct cred *new;
 578         int retval;
 579         kuid_t kruid, keuid, ksuid;
 580
 581         kruid = make_kuid(ns, ruid);
 582         keuid = make_kuid(ns, euid);
 583         ksuid = make_kuid(ns, suid);
 584
 585         if ((ruid != (uid_t) -1) && !uid_valid(kruid))
 586                 return -EINVAL;
 587
 588         if ((euid != (uid_t) -1) && !uid_valid(keuid))
 589                 return -EINVAL;
 590
 591         if ((suid != (uid_t) -1) && !uid_valid(ksuid))
 592                 return -EINVAL;
 593
 594         new = prepare_creds();
 595         if (!new)
 596                 return -ENOMEM;
 597
 598         old = current_cred();
 599
 600         retval = -EPERM;
 601         if (!ns_capable(old->user_ns, CAP_SETUID)) {
 602                 if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
 603                     !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
 604                         goto error;
 605                 if (euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
 606                     !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
 607                         goto error;
 608                 if (suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
 609                     !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
 610                         goto error;
 611         }
 612
 613         if (ruid != (uid_t) -1) {
 614                 new->uid = kruid;
 615                 if (!uid_eq(kruid, old->uid)) {
 616                         retval = set_user(new);
 617                         if (retval < 0)
 618                                 goto error;
 619                 }
 620         }
 621         if (euid != (uid_t) -1)
 622                 new->euid = keuid;
 623         if (suid != (uid_t) -1)
 624                 new->suid = ksuid;
 625         new->fsuid = new->euid;
 626
 627         retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
 628         if (retval < 0)
 629                 goto error;
 630
 631         return commit_creds(new);
 632
 633 error:
 634         abort_creds(new);
 635         return retval;
 636 }
 637
 638 SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
 639 {
 640         const struct cred *cred = current_cred();
 641         int retval;
 642         uid_t ruid, euid, suid;
 643
 644         ruid = from_kuid_munged(cred->user_ns, cred->uid);
 645         euid = from_kuid_munged(cred->user_ns, cred->euid);
 646         suid = from_kuid_munged(cred->user_ns, cred->suid);
 647
 648         retval = put_user(ruid, ruidp);
 649         if (!retval) {
 650                 retval = put_user(euid, euidp);
 651                 if (!retval)
 652                         return put_user(suid, suidp);
 653         }
 654         return retval;
 655 }
 656
 657 /*
 658  * Same as above, but for rgid, egid, sgid.
 659  */
 660 SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
 661 {
 662         struct user_namespace *ns = current_user_ns();
 663         const struct cred *old;
 664         struct cred *new;
 665         int retval;
 666         kgid_t krgid, kegid, ksgid;
 667
 668         krgid = make_kgid(ns, rgid);
 669         kegid = make_kgid(ns, egid);
 670         ksgid = make_kgid(ns, sgid);
 671
 672         if ((rgid != (gid_t) -1) && !gid_valid(krgid))
 673                 return -EINVAL;
 674         if ((egid != (gid_t) -1) && !gid_valid(kegid))
 675                 return -EINVAL;
 676         if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
 677                 return -EINVAL;
 678
 679         new = prepare_creds();
 680         if (!new)
 681                 return -ENOMEM;
 682         old = current_cred();
 683
 684         retval = -EPERM;
 685         if (!ns_capable(old->user_ns, CAP_SETGID)) {
 686                 if (rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
 687                     !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
 688                         goto error;
 689                 if (egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
 690                     !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
 691                         goto error;
 692                 if (sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
 693                     !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
 694                         goto error;
 695         }
 696
 697         if (rgid != (gid_t) -1)
 698                 new->gid = krgid;
 699         if (egid != (gid_t) -1)
 700                 new->egid = kegid;
 701         if (sgid != (gid_t) -1)
 702                 new->sgid = ksgid;
 703         new->fsgid = new->egid;
 704
 705         return commit_creds(new);
 706
 707 error:
 708         abort_creds(new);
 709         return retval;
 710 }
 711
 712 SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
 713 {
 714         const struct cred *cred = current_cred();
 715         int retval;
 716         gid_t rgid, egid, sgid;
 717
 718         rgid = from_kgid_munged(cred->user_ns, cred->gid);
 719         egid = from_kgid_munged(cred->user_ns, cred->egid);
 720         sgid = from_kgid_munged(cred->user_ns, cred->sgid);
 721
 722         retval = put_user(rgid, rgidp);
 723         if (!retval) {
 724                 retval = put_user(egid, egidp);
 725                 if (!retval)
 726                         retval = put_user(sgid, sgidp);
 727         }
 728
 729         return retval;
 730 }
 731
 732
 733 /*
 734  * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
 735  * is used for "access()" and for the NFS daemon (letting nfsd stay at
 736  * whatever uid it wants to). It normally shadows "euid", except when
 737  * explicitly set by setfsuid() or for access..
 738  */
 739 SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 740 {
 741         const struct cred *old;
 742         struct cred *new;
 743         uid_t old_fsuid;
 744         kuid_t kuid;
 745
 746         old = current_cred();
 747         old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
 748
 749         kuid = make_kuid(old->user_ns, uid);
 750         if (!uid_valid(kuid))
 751                 return old_fsuid;
 752
 753         new = prepare_creds();
 754         if (!new)
 755                 return old_fsuid;
 756
 757         if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
 758             uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
 759             ns_capable(old->user_ns, CAP_SETUID)) {
 760                 if (!uid_eq(kuid, old->fsuid)) {
 761                         new->fsuid = kuid;
 762                         if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
 763                                 goto change_okay;
 764                 }
 765         }
 766
 767         abort_creds(new);
 768         return old_fsuid;
 769
 770 change_okay:
 771         commit_creds(new);
 772         return old_fsuid;
 773 }
 774
 775 /*
 776  * Samma på svenska..
 777  */
 778 SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 779 {
 780         const struct cred *old;
 781         struct cred *new;
 782         gid_t old_fsgid;
 783         kgid_t kgid;
 784
 785         old = current_cred();
 786         old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
 787
 788         kgid = make_kgid(old->user_ns, gid);
 789         if (!gid_valid(kgid))
 790                 return old_fsgid;
 791
 792         new = prepare_creds();
 793         if (!new)
 794                 return old_fsgid;
 795
 796         if (gid_eq(kgid, old->gid)  || gid_eq(kgid, old->egid)  ||
 797             gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
 798             ns_capable(old->user_ns, CAP_SETGID)) {
 799                 if (!gid_eq(kgid, old->fsgid)) {
 800                         new->fsgid = kgid;
 801                         goto change_okay;
 802                 }
 803         }
 804
 805         abort_creds(new);
 806         return old_fsgid;
 807
 808 change_okay:
 809         commit_creds(new);
 810         return old_fsgid;
 811 }
 812
 813 /**
 814  * sys_getpid - return the thread group id of the current process
 815  *
 816  * Note, despite the name, this returns the tgid not the pid.  The tgid and
 817  * the pid are identical unless CLONE_THREAD was specified on clone() in
 818  * which case the tgid is the same in all threads of the same group.
 819  *
 820  * This is SMP safe as current->tgid does not change.
 821  */
 822 SYSCALL_DEFINE0(getpid)
 823 {
 824         return task_tgid_vnr(current);
 825 }
 826
 827 /* Thread ID - the internal kernel "pid" */
 828 SYSCALL_DEFINE0(gettid)
 829 {
 830         return task_pid_vnr(current);
 831 }
 832
 833 /*
 834  * Accessing ->real_parent is not SMP-safe, it could
 835  * change from under us. However, we can use a stale
 836  * value of ->real_parent under rcu_read_lock(), see
 837  * release_task()->call_rcu(delayed_put_task_struct).
 838  */
 839 SYSCALL_DEFINE0(getppid)
 840 {
 841         int pid;
 842
 843         rcu_read_lock();
 844         pid = task_tgid_vnr(rcu_dereference(current->real_parent));
 845         rcu_read_unlock();
 846
 847         return pid;
 848 }
 849
 850 SYSCALL_DEFINE0(getuid)
 851 {
 852         /* Only we change this so SMP safe */
 853         return from_kuid_munged(current_user_ns(), current_uid());
 854 }
 855
 856 SYSCALL_DEFINE0(geteuid)
 857 {
 858         /* Only we change this so SMP safe */
 859         return from_kuid_munged(current_user_ns(), current_euid());
 860 }
 861
 862 SYSCALL_DEFINE0(getgid)
 863 {
 864         /* Only we change this so SMP safe */
 865         return from_kgid_munged(current_user_ns(), current_gid());
 866 }
 867
 868 SYSCALL_DEFINE0(getegid)
 869 {
 870         /* Only we change this so SMP safe */
 871         return from_kgid_munged(current_user_ns(), current_egid());
 872 }
 873
 874 void do_sys_times(struct tms *tms)
 875 {
 876         cputime_t tgutime, tgstime, cutime, cstime;
 877
 878         thread_group_cputime_adjusted(current, &tgutime, &tgstime);
 879         cutime = current->signal->cutime;
 880         cstime = current->signal->cstime;
 881         tms->tms_utime = cputime_to_clock_t(tgutime);
 882         tms->tms_stime = cputime_to_clock_t(tgstime);
 883         tms->tms_cutime = cputime_to_clock_t(cutime);
 884         tms->tms_cstime = cputime_to_clock_t(cstime);
 885 }
 886
 887 SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 888 {
 889         if (tbuf) {
 890                 struct tms tmp;
 891
 892                 do_sys_times(&tmp);
 893                 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 894                         return -EFAULT;
 895         }
 896         force_successful_syscall_return();
 897         return (long) jiffies_64_to_clock_t(get_jiffies_64());
 898 }
 899
/*
 * setpgid(): move the thread-group leader identified by @pid into the
 * process group identified by @pgid.
 *
 * A @pid of 0 means the caller's thread-group leader; a @pgid of 0 means
 * "use @pid as the process group id".
 *
 * Returns 0 on success, or:
 *   -EINVAL  @pgid is negative, or the task is not a thread-group leader
 *   -ESRCH   no task with that pid, or the task is neither the caller's
 *            group leader nor a task whose real parent is in the caller's
 *            thread group
 *   -EPERM   the task is in a different session than the caller, has
 *            ->signal->leader set, or no task in the target process group
 *            shares the caller's session
 *   -EACCES  the task's real parent is in the caller's thread group but
 *            the task no longer has PF_FORKNOEXEC set
 *   or whatever non-zero value security_task_setpgid() returns.
 *
 * Historical notes:
 *
 * This needs some heavy checking ...
 * I just haven't the stomach for it. I also don't fully
 * understand sessions/pgrp etc. Let somebody who does explain it.
 *
 * OK, I think I have the protection semantics right.... this is really
 * only important on a multi-user system anyway, to make sure one user
 * can't send a signal to a process owned by another.  -TYT, 12/12/91
 *
 * !PF_FORKNOEXEC check to conform completely to POSIX.
 */
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
        struct task_struct *p;
        struct task_struct *group_leader = current->group_leader;
        struct pid *pgrp;
        int err;

        /* Resolve the "0 means default" conventions before validating. */
        if (!pid)
                pid = task_pid_vnr(group_leader);
        if (!pgid)
                pgid = pid;
        if (pgid < 0)
                return -EINVAL;
        rcu_read_lock();

        /* From this point forward we keep holding onto the tasklist lock
         * so that our parent does not change from under us. -DaveM
         */
        write_lock_irq(&tasklist_lock);

        /*
         * Each check below stores its error code first and then bails out
         * on failure, so the order of the statements determines which
         * error the caller sees.
         */
        err = -ESRCH;
        p = find_task_by_vpid(pid);
        if (!p)
                goto out;

        err = -EINVAL;
        if (!thread_group_leader(p))
                goto out;

        if (same_thread_group(p->real_parent, group_leader)) {
                /* p's real parent is in the caller's thread group. */
                err = -EPERM;
                if (task_session(p) != task_session(group_leader))
                        goto out;
                /*
                 * NOTE(review): PF_FORKNOEXEC being clear presumably means
                 * the task has already exec'd -- confirm.
                 */
                err = -EACCES;
                if (!(p->flags & PF_FORKNOEXEC))
                        goto out;
        } else {
                /* Otherwise only the caller's own group leader is acceptable. */
                err = -ESRCH;
                if (p != group_leader)
                        goto out;
        }

        /* NOTE(review): ->signal->leader presumably marks a session leader -- confirm. */
        err = -EPERM;
        if (p->signal->leader)
                goto out;

        /* Default target: the group identified by p's own pid. */
        pgrp = task_pid(p);
        if (pgid != pid) {
                struct task_struct *g;

                /*
                 * Joining another group: it must contain a task that is in
                 * the caller's session.  err is still -EPERM here.
                 */
                pgrp = find_vpid(pgid);
                g = pid_task(pgrp, PIDTYPE_PGID);
                if (!g || task_session(g) != task_session(group_leader))
                        goto out;
        }

        err = security_task_setpgid(p, pgid);
        if (err)
                goto out;

        /* Only touch the pid links when the group actually changes. */
        if (task_pgrp(p) != pgrp)
                change_pid(p, PIDTYPE_PGID, pgrp);

        err = 0;
out:
        /* All paths lead to here, thus we are safe. -DaveM */
        write_unlock_irq(&tasklist_lock);
        rcu_read_unlock();
        return err;
}
 981
 982 SYSCALL_DEFINE1(getpgid, pid_t, pid)
 983 {
 984         struct task_struct *p;
 985         struct pid *grp;
 986         int retval;
 987
 988         rcu_read_lock();
 989         if (!pid)
 990                 grp = task_pgrp(current);
 991         else {
 992                 retval = -ESRCH;
 993                 p = find_task_by_vpid(pid);
 994                 if (!p)
 995                         goto out;
 996                 grp = task_pgrp(p);
 997                 if (!grp)
 998                         goto out;
 999
1000                 retval = security_task_getpgid(p);
1001                 if (retval)
1002                         goto out;
1003         }
1004         retval = pid_vnr(grp);
1005 out:
1006         rcu_read_unlock();
1007         return retval;
1008 }
1009
1010 #ifdef __ARCH_WANT_SYS_GETPGRP
1011
1012 SYSCALL_DEFINE0(getpgrp)
1013 {
1014         return sys_getpgid(0);
1015 }
1016
1017 #endif
1018
1019 SYSCALL_DEFINE1(getsid, pid_t, pid)
1020 {
1021         struct task_struct *p;
1022         struct pid *sid;
1023         int retval;
1024
1025         rcu_read_lock();
1026         if (!pid)
1027                 sid = task_session(current);
1028         else {
1029                 retval = -ESRCH;
1030                 p = find_task_by_vpid(pid);
1031                 if (!p)
1032                         goto out;
1033                 sid = task_session(p);
1034                 if (!sid)
1035                         goto out;
1036
1037                 retval = security_task_getsid(p);
1038                 if (retval)
1039                         goto out;
1040         }
1041         retval = pid_vnr(sid);
1042 out:
1043         rcu_read_unlock();
1044         return retval;
1045 }
1046
1047 static void set_special_pids(struct pid *pid)
1048 {
1049         struct task_struct *curr = current->group_leader;
1050
1051         if (task_session(curr) != pid)
1052                 change_pid(curr, PIDTYPE_SID, pid);
1053
1054         if (task_pgrp(curr) != pid)
1055                 change_pid(curr, PIDTYPE_PGID, pid);
1056 }
1057
1058 SYSCALL_DEFINE0(setsid)
1059 {
1060         struct task_struct *group_leader = current->group_leader;
1061         struct pid *sid = task_pid(group_leader);
1062         pid_t session = pid_vnr(sid);
1063         int err = -EPERM;
1064
1065         write_lock_irq(&tasklist_lock);
1066         /* Fail if I am already a session leader */
1067         if (group_leader->signal->leader)
1068                 goto out;
1069
1070         /* Fail if a process group id already exists that equals the
1071          * proposed session id.
1072          */
1073         if (pid_task(sid, PIDTYPE_PGID))
1074                 goto out;
1075
1076         group_leader->signal->leader = 1;
1077         set_special_pids(sid);
1078
1079         proc_clear_tty(group_leader);
1080
1081         err = session;
1082 out:
1083         write_unlock_irq(&tasklist_lock);
1084         if (err > 0) {
1085                 proc_sid_connector(group_leader);
1086                 sched_autogroup_create_attach(group_leader);
1087         }
1088         return err;
1089 }
1090
/*
 * Reader/writer semaphore guarding the data returned by utsname(): the
 * uname-family and gethostname syscalls in this file take it for reading,
 * sethostname and setdomainname take it for writing.
 */
DECLARE_RWSEM(uts_sem);
1092
/*
 * override_architecture(name) - when COMPAT_UTS_MACHINE is defined and the
 * caller's personality is PER_LINUX32, overwrite the user-space
 * (name)->machine field with COMPAT_UTS_MACHINE.
 *
 * Evaluates to non-zero only if that copy_to_user() failed; evaluates to 0
 * when no override applies or when COMPAT_UTS_MACHINE is not defined.
 * The argument is parenthesized so any pointer expression may be passed.
 */
#ifdef COMPAT_UTS_MACHINE
#define override_architecture(name) \
        (personality(current->personality) == PER_LINUX32 && \
         copy_to_user((name)->machine, COMPAT_UTS_MACHINE, \
                      sizeof(COMPAT_UTS_MACHINE)))
#else
#define override_architecture(name)     0
#endif
1101
1102 /*
1103  * Work around broken programs that cannot handle "Linux 3.0".
1104  * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1105  */
1106 static int override_release(char __user *release, size_t len)
1107 {
1108         int ret = 0;
1109
1110         if (current->personality & UNAME26) {
1111                 const char *rest = UTS_RELEASE;
1112                 char buf[65] = { 0 };
1113                 int ndots = 0;
1114                 unsigned v;
1115                 size_t copy;
1116
1117                 while (*rest) {
1118                         if (*rest == '.' && ++ndots >= 3)
1119                                 break;
1120                         if (!isdigit(*rest) && *rest != '.')
1121                                 break;
1122                         rest++;
1123                 }
1124                 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1125                 copy = clamp_t(size_t, len, 1, sizeof(buf));
1126                 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
1127                 ret = copy_to_user(release, buf, copy + 1);
1128         }
1129         return ret;
1130 }
1131
1132 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1133 {
1134         int errno = 0;
1135
1136         down_read(&uts_sem);
1137         if (copy_to_user(name, utsname(), sizeof *name))
1138                 errno = -EFAULT;
1139         up_read(&uts_sem);
1140
1141         if (!errno && override_release(name->release, sizeof(name->release)))
1142                 errno = -EFAULT;
1143         if (!errno && override_architecture(name))
1144                 errno = -EFAULT;
1145         return errno;
1146 }
1147
1148 #ifdef __ARCH_WANT_SYS_OLD_UNAME
1149 /*
1150  * Old cruft
1151  */
1152 SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1153 {
1154         int error = 0;
1155
1156         if (!name)
1157                 return -EFAULT;
1158
1159         down_read(&uts_sem);
1160         if (copy_to_user(name, utsname(), sizeof(*name)))
1161                 error = -EFAULT;
1162         up_read(&uts_sem);
1163
1164         if (!error && override_release(name->release, sizeof(name->release)))
1165                 error = -EFAULT;
1166         if (!error && override_architecture(name))
1167                 error = -EFAULT;
1168         return error;
1169 }
1170
1171 SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1172 {
1173         int error;
1174
1175         if (!name)
1176                 return -EFAULT;
1177         if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1178                 return -EFAULT;
1179
1180         down_read(&uts_sem);
1181         error = __copy_to_user(&name->sysname, &utsname()->sysname,
1182                                __OLD_UTS_LEN);
1183         error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1184         error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1185                                 __OLD_UTS_LEN);
1186         error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1187         error |= __copy_to_user(&name->release, &utsname()->release,
1188                                 __OLD_UTS_LEN);
1189         error |= __put_user(0, name->release + __OLD_UTS_LEN);
1190         error |= __copy_to_user(&name->version, &utsname()->version,
1191                                 __OLD_UTS_LEN);
1192         error |= __put_user(0, name->version + __OLD_UTS_LEN);
1193         error |= __copy_to_user(&name->machine, &utsname()->machine,
1194                                 __OLD_UTS_LEN);
1195         error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1196         up_read(&uts_sem);
1197
1198         if (!error && override_architecture(name))
1199                 error = -EFAULT;
1200         if (!error && override_release(name->release, sizeof(name->release)))
1201                 error = -EFAULT;
1202         return error ? -EFAULT : 0;
1203 }
1204 #endif
1205
1206 SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1207 {
1208         int errno;
1209         char tmp[__NEW_UTS_LEN];
1210
1211         if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1212                 return -EPERM;
1213
1214         if (len < 0 || len > __NEW_UTS_LEN)
1215                 return -EINVAL;
1216         down_write(&uts_sem);
1217         errno = -EFAULT;
1218         if (!copy_from_user(tmp, name, len)) {
1219                 struct new_utsname *u = utsname();
1220
1221                 memcpy(u->nodename, tmp, len);
1222                 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1223                 errno = 0;
1224                 uts_proc_notify(UTS_PROC_HOSTNAME);
1225         }
1226         up_write(&uts_sem);
1227         return errno;
1228 }
1229
1230 #ifdef __ARCH_WANT_SYS_GETHOSTNAME
1231
1232 SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
1233 {
1234         int i, errno;
1235         struct new_utsname *u;
1236
1237         if (len < 0)
1238                 return -EINVAL;
1239         down_read(&uts_sem);
1240         u = utsname();
1241         i = 1 + strlen(u->nodename);
1242         if (i > len)
1243                 i = len;
1244         errno = 0;
1245         if (copy_to_user(name, u->nodename, i))
1246                 errno = -EFAULT;
1247         up_read(&uts_sem);
1248         return errno;
1249 }
1250
1251 #endif
1252
1253 /*
1254  * Only setdomainname; getdomainname can be implemented by calling
1255  * uname()
1256  */
1257 SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1258 {
1259         int errno;
1260         char tmp[__NEW_UTS_LEN];
1261
1262         if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1263                 return -EPERM;
1264         if (len < 0 || len > __NEW_UTS_LEN)
1265                 return -EINVAL;
1266
1267         down_write(&uts_sem);
1268         errno = -EFAULT;
1269         if (!copy_from_user(tmp, name, len)) {
1270                 struct new_utsname *u = utsname();
1271
1272                 memcpy(u->domainname, tmp, len);
1273                 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1274                 errno = 0;
1275                 uts_proc_notify(UTS_PROC_DOMAINNAME);
1276         }
1277         up_write(&uts_sem);
1278         return errno;
1279 }
1280
1281 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1282 {
1283         struct rlimit value;
1284         int ret;
1285
1286         ret = do_prlimit(current, resource, NULL, &value);
1287         if (!ret)
1288                 ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1289
1290         return ret;
1291 }
1292
1293 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
1294
1295 /*
1296  *      Back compatibility for getrlimit. Needed for some apps.
1297  */
1298 SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1299                 struct rlimit __user *, rlim)
1300 {
1301         struct rlimit x;
1302         if (resource >= RLIM_NLIMITS)
1303                 return -EINVAL;
1304
1305         task_lock(current->group_leader);
1306         x = current->signal->rlim[resource];
1307         task_unlock(current->group_leader);
1308         if (x.rlim_cur > 0x7FFFFFFF)
1309                 x.rlim_cur = 0x7FFFFFFF;
1310         if (x.rlim_max > 0x7FFFFFFF)
1311                 x.rlim_max = 0x7FFFFFFF;
1312         return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1313 }
1314
1315 #endif
1316
1317 static inline bool rlim64_is_infinity(__u64 rlim64)
1318 {
1319 #if BITS_PER_LONG < 64
1320         return rlim64 >= ULONG_MAX;
1321 #else
1322         return rlim64 == RLIM64_INFINITY;
1323 #endif
1324 }
1325
1326 static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
1327 {
1328         if (rlim->rlim_cur == RLIM_INFINITY)
1329                 rlim64->rlim_cur = RLIM64_INFINITY;
1330         else
1331                 rlim64->rlim_cur = rlim->rlim_cur;
1332         if (rlim->rlim_max == RLIM_INFINITY)
1333                 rlim64->rlim_max = RLIM64_INFINITY;
1334         else
1335                 rlim64->rlim_max = rlim->rlim_max;
1336 }
1337
1338 static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
1339 {
1340         if (rlim64_is_infinity(rlim64->rlim_cur))
1341                 rlim->rlim_cur = RLIM_INFINITY;
1342         else
1343                 rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
1344         if (rlim64_is_infinity(rlim64->rlim_max))
1345                 rlim->rlim_max = RLIM_INFINITY;
1346         else
1347                 rlim->rlim_max = (unsigned long)rlim64->rlim_max;
1348 }
1349
1350 /* make sure you are allowed to change @tsk limits before calling this */
1351 int do_prlimit(struct task_struct *tsk, unsigned int resource,
1352                 struct rlimit *new_rlim, struct rlimit *old_rlim)
1353 {
1354         struct rlimit *rlim;
1355         int retval = 0;
1356
1357         if (resource >= RLIM_NLIMITS)
1358                 return -EINVAL;
1359         if (new_rlim) {
1360                 if (new_rlim->rlim_cur > new_rlim->rlim_max)
1361                         return -EINVAL;
1362                 if (resource == RLIMIT_NOFILE &&
1363                                 new_rlim->rlim_max > sysctl_nr_open)
1364                         return -EPERM;
1365         }
1366
1367         /* protect tsk->signal and tsk->sighand from disappearing */
1368         read_lock(&tasklist_lock);
1369         if (!tsk->sighand) {
1370                 retval = -ESRCH;
1371                 goto out;
1372         }
1373
1374         rlim = tsk->signal->rlim + resource;
1375         task_lock(tsk->group_leader);
1376         if (new_rlim) {
1377                 /* Keep the capable check against init_user_ns until
1378                    cgroups can contain all limits */
1379                 if (new_rlim->rlim_max > rlim->rlim_max &&
1380                                 !capable(CAP_SYS_RESOURCE))
1381                         retval = -EPERM;
1382                 if (!retval)
1383                         retval = security_task_setrlimit(tsk->group_leader,
1384                                         resource, new_rlim);
1385                 if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
1386                         /*
1387                          * The caller is asking for an immediate RLIMIT_CPU
1388                          * expiry.  But we use the zero value to mean "it was
1389                          * never set".  So let's cheat and make it one second
1390                          * instead
1391                          */
1392                         new_rlim->rlim_cur = 1;
1393                 }
1394         }
1395         if (!retval) {
1396                 if (old_rlim)
1397                         *old_rlim = *rlim;
1398                 if (new_rlim)
1399                         *rlim = *new_rlim;
1400         }
1401         task_unlock(tsk->group_leader);
1402
1403         /*
1404          * RLIMIT_CPU handling.   Note that the kernel fails to return an error
1405          * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
1406          * very long-standing error, and fixing it now risks breakage of
1407          * applications, so we live with it
1408          */
1409          if (!retval && new_rlim && resource == RLIMIT_CPU &&
1410                          new_rlim->rlim_cur != RLIM_INFINITY)
1411                 update_rlimit_cpu(tsk, new_rlim->rlim_cur);
1412 out:
1413         read_unlock(&tasklist_lock);
1414         return retval;
1415 }
1416
1417 /* rcu lock must be held */
1418 static int check_prlimit_permission(struct task_struct *task)
1419 {
1420         const struct cred *cred = current_cred(), *tcred;
1421
1422         if (current == task)
1423                 return 0;
1424
1425         tcred = __task_cred(task);
1426         if (uid_eq(cred->uid, tcred->euid) &&
1427             uid_eq(cred->uid, tcred->suid) &&
1428             uid_eq(cred->uid, tcred->uid)  &&
1429             gid_eq(cred->gid, tcred->egid) &&
1430             gid_eq(cred->gid, tcred->sgid) &&
1431             gid_eq(cred->gid, tcred->gid))
1432                 return 0;
1433         if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1434                 return 0;
1435
1436         return -EPERM;
1437 }
1438
1439 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
1440                 const struct rlimit64 __user *, new_rlim,
1441                 struct rlimit64 __user *, old_rlim)
1442 {
1443         struct rlimit64 old64, new64;
1444         struct rlimit old, new;
1445         struct task_struct *tsk;
1446         int ret;
1447
1448         if (new_rlim) {
1449                 if (copy_from_user(&new64, new_rlim, sizeof(new64)))
1450                         return -EFAULT;
1451                 rlim64_to_rlim(&new64, &new);
1452         }
1453
1454         rcu_read_lock();
1455         tsk = pid ? find_task_by_vpid(pid) : current;
1456         if (!tsk) {
1457                 rcu_read_unlock();
1458                 return -ESRCH;
1459         }
1460         ret = check_prlimit_permission(tsk);
1461         if (ret) {
1462                 rcu_read_unlock();
1463                 return ret;
1464         }
1465         get_task_struct(tsk);
1466         rcu_read_unlock();
1467
1468         ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
1469                         old_rlim ? &old : NULL);
1470
1471         if (!ret && old_rlim) {
1472                 rlim_to_rlim64(&old, &old64);
1473                 if (copy_to_user(old_rlim, &old64, sizeof(old64)))
1474                         ret = -EFAULT;
1475         }
1476
1477         put_task_struct(tsk);
1478         return ret;
1479 }
1480
1481 SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1482 {
1483         struct rlimit new_rlim;
1484
1485         if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1486                 return -EFAULT;
1487         return do_prlimit(current, resource, &new_rlim, NULL);
1488 }
1489
1490 /*
1491  * It would make sense to put struct rusage in the task_struct,
1492  * except that would make the task_struct be *really big*.  After
1493  * task_struct gets moved into malloc'ed memory, it would
1494  * make sense to do this.  It will make moving the rest of the information
1495  * a lot simpler!  (Which we're not doing right now because we're not
1496  * measuring them yet).
1497  *
1498  * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1499  * races with threads incrementing their own counters.  But since word
1500  * reads are atomic, we either get new values or old values and we don't
1501  * care which for the sums.  We always take the siglock to protect reading
1502  * the c* fields from p->signal from races with exit.c updating those
1503  * fields when reaping, so a sample either gets all the additions of a
1504  * given child after it's reaped, or none so this sample is before reaping.
1505  *
 * Locking:
 * We need to take the siglock for CHILDREN, SELF and BOTH
 * for the cases current multithreaded, non-current single threaded
 * non-current multithreaded.  Thread traversal is now safe with
 * the siglock held.
 * Strictly speaking, we do not need to take the siglock if we are current and
 * single threaded, as no one else can take our signal_struct away, no one
 * else can reap the children to update signal->c* counters, and no one else
 * can race with the signal-> fields. If we do not take any lock, the
 * signal-> fields could be read out of order while another thread was just
 * exiting. So we should place a read memory barrier when we avoid the lock.
 * On the writer side, write memory barrier is implied in __exit_signal
 * as __exit_signal releases the siglock spinlock after updating the signal->
 * fields. But we don't do this yet to keep things simple.
1520  *
1521  */
1522
1523 static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1524 {
1525         r->ru_nvcsw += t->nvcsw;
1526         r->ru_nivcsw += t->nivcsw;
1527         r->ru_minflt += t->min_flt;
1528         r->ru_majflt += t->maj_flt;
1529         r->ru_inblock += task_io_get_inblock(t);
1530         r->ru_oublock += task_io_get_oublock(t);
1531 }
1532
1533 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1534 {
1535         struct task_struct *t;
1536         unsigned long flags;
1537         cputime_t tgutime, tgstime, utime, stime;
1538         unsigned long maxrss = 0;
1539
1540         memset((char *)r, 0, sizeof (*r));
1541         utime = stime = 0;
1542
1543         if (who == RUSAGE_THREAD) {
1544                 task_cputime_adjusted(current, &utime, &stime);
1545                 accumulate_thread_rusage(p, r);
1546                 maxrss = p->signal->maxrss;
1547                 goto out;
1548         }
1549
1550         if (!lock_task_sighand(p, &flags))
1551                 return;
1552
1553         switch (who) {
1554         case RUSAGE_BOTH:
1555         case RUSAGE_CHILDREN:
1556                 utime = p->signal->cutime;
1557                 stime = p->signal->cstime;
1558                 r->ru_nvcsw = p->signal->cnvcsw;
1559                 r->ru_nivcsw = p->signal->cnivcsw;
1560                 r->ru_minflt = p->signal->cmin_flt;
1561                 r->ru_majflt = p->signal->cmaj_flt;
1562                 r->ru_inblock = p->signal->cinblock;
1563                 r->ru_oublock = p->signal->coublock;
1564                 maxrss = p->signal->cmaxrss;
1565
1566                 if (who == RUSAGE_CHILDREN)
1567                         break;
1568
1569         case RUSAGE_SELF:
1570                 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1571                 utime += tgutime;
1572                 stime += tgstime;
1573                 r->ru_nvcsw += p->signal->nvcsw;
1574                 r->ru_nivcsw += p->signal->nivcsw;
1575                 r->ru_minflt += p->signal->min_flt;
1576                 r->ru_majflt += p->signal->maj_flt;
1577                 r->ru_inblock += p->signal->inblock;
1578                 r->ru_oublock += p->signal->oublock;
1579                 if (maxrss < p->signal->maxrss)
1580                         maxrss = p->signal->maxrss;
1581                 t = p;
1582                 do {
1583                         accumulate_thread_rusage(t, r);
1584                 } while_each_thread(p, t);
1585                 break;
1586
1587         default:
1588                 BUG();
1589         }
1590         unlock_task_sighand(p, &flags);
1591
1592 out:
1593         cputime_to_timeval(utime, &r->ru_utime);
1594         cputime_to_timeval(stime, &r->ru_stime);
1595
1596         if (who != RUSAGE_CHILDREN) {
1597                 struct mm_struct *mm = get_task_mm(p);
1598
1599                 if (mm) {
1600                         setmax_mm_hiwater_rss(&maxrss, mm);
1601                         mmput(mm);
1602                 }
1603         }
1604         r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1605 }
1606
1607 int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1608 {
1609         struct rusage r;
1610
1611         k_getrusage(p, who, &r);
1612         return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1613 }
1614
1615 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1616 {
1617         if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1618             who != RUSAGE_THREAD)
1619                 return -EINVAL;
1620         return getrusage(current, who, ru);
1621 }
1622
1623 #ifdef CONFIG_COMPAT
1624 COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1625 {
1626         struct rusage r;
1627
1628         if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1629             who != RUSAGE_THREAD)
1630                 return -EINVAL;
1631
1632         k_getrusage(current, who, &r);
1633         return put_compat_rusage(&r, ru);
1634 }
1635 #endif
1636
1637 SYSCALL_DEFINE1(umask, int, mask)
1638 {
1639         mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1640         return mask;
1641 }
1642
1643 static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1644 {
1645         struct fd exe;
1646         struct inode *inode;
1647         int err;
1648
1649         VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1650
1651         exe = fdget(fd);
1652         if (!exe.file)
1653                 return -EBADF;
1654
1655         inode = file_inode(exe.file);
1656
1657         /*
1658          * Because the original mm->exe_file points to executable file, make
1659          * sure that this one is executable as well, to avoid breaking an
1660          * overall picture.
1661          */
1662         err = -EACCES;
1663         if (!S_ISREG(inode->i_mode)     ||
1664             exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1665                 goto exit;
1666
1667         err = inode_permission(inode, MAY_EXEC);
1668         if (err)
1669                 goto exit;
1670
1671         /*
1672          * Forbid mm->exe_file change if old file still mapped.
1673          */
1674         err = -EBUSY;
1675         if (mm->exe_file) {
1676                 struct vm_area_struct *vma;
1677
1678                 for (vma = mm->mmap; vma; vma = vma->vm_next)
1679                         if (vma->vm_file &&
1680                             path_equal(&vma->vm_file->f_path,
1681                                        &mm->exe_file->f_path))
1682                                 goto exit;
1683         }
1684
1685         /*
1686          * The symlink can be changed only once, just to disallow arbitrary
1687          * transitions malicious software might bring in. This means one
1688          * could make a snapshot over all processes running and monitor
1689          * /proc/pid/exe changes to notice unusual activity if needed.
1690          */
1691         err = -EPERM;
1692         if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1693                 goto exit;
1694
1695         err = 0;
1696         set_mm_exe_file(mm, exe.file);  /* this grabs a reference to exe.file */
1697 exit:
1698         fdput(exe);
1699         return err;
1700 }
1701
1702 #ifdef CONFIG_CHECKPOINT_RESTORE
1703 /*
1704  * WARNING: we don't require any capability here so be very careful
1705  * in what is allowed for modification from userspace.
1706  */
1707 static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1708 {
1709         unsigned long mmap_max_addr = TASK_SIZE;
1710         struct mm_struct *mm = current->mm;
1711         int error = -EINVAL, i;
1712
1713         static const unsigned char offsets[] = {
1714                 offsetof(struct prctl_mm_map, start_code),
1715                 offsetof(struct prctl_mm_map, end_code),
1716                 offsetof(struct prctl_mm_map, start_data),
1717                 offsetof(struct prctl_mm_map, end_data),
1718                 offsetof(struct prctl_mm_map, start_brk),
1719                 offsetof(struct prctl_mm_map, brk),
1720                 offsetof(struct prctl_mm_map, start_stack),
1721                 offsetof(struct prctl_mm_map, arg_start),
1722                 offsetof(struct prctl_mm_map, arg_end),
1723                 offsetof(struct prctl_mm_map, env_start),
1724                 offsetof(struct prctl_mm_map, env_end),
1725         };
1726
1727         /*
1728          * Make sure the members are not somewhere outside
1729          * of allowed address space.
1730          */
1731         for (i = 0; i < ARRAY_SIZE(offsets); i++) {
1732                 u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
1733
1734                 if ((unsigned long)val >= mmap_max_addr ||
1735                     (unsigned long)val < mmap_min_addr)
1736                         goto out;
1737         }
1738
1739         /*
1740          * Make sure the pairs are ordered.
1741          */
1742 #define __prctl_check_order(__m1, __op, __m2)                           \
1743         ((unsigned long)prctl_map->__m1 __op                            \
1744          (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
1745         error  = __prctl_check_order(start_code, <, end_code);
1746         error |= __prctl_check_order(start_data, <, end_data);
1747         error |= __prctl_check_order(start_brk, <=, brk);
1748         error |= __prctl_check_order(arg_start, <=, arg_end);
1749         error |= __prctl_check_order(env_start, <=, env_end);
1750         if (error)
1751                 goto out;
1752 #undef __prctl_check_order
1753
1754         error = -EINVAL;
1755
1756         /*
1757          * @brk should be after @end_data in traditional maps.
1758          */
1759         if (prctl_map->start_brk <= prctl_map->end_data ||
1760             prctl_map->brk <= prctl_map->end_data)
1761                 goto out;
1762
1763         /*
1764          * Neither we should allow to override limits if they set.
1765          */
1766         if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
1767                               prctl_map->start_brk, prctl_map->end_data,
1768                               prctl_map->start_data))
1769                         goto out;
1770
1771         /*
1772          * Someone is trying to cheat the auxv vector.
1773          */
1774         if (prctl_map->auxv_size) {
1775                 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1776                         goto out;
1777         }
1778
1779         /*
1780          * Finally, make sure the caller has the rights to
1781          * change /proc/pid/exe link: only local root should
1782          * be allowed to.
1783          */
1784         if (prctl_map->exe_fd != (u32)-1) {
1785                 struct user_namespace *ns = current_user_ns();
1786                 const struct cred *cred = current_cred();
1787
1788                 if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
1789                     !gid_eq(cred->gid, make_kgid(ns, 0)))
1790                         goto out;
1791         }
1792
1793         error = 0;
1794 out:
1795         return error;
1796 }
1797
1798 static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
1799 {
1800         struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
1801         unsigned long user_auxv[AT_VECTOR_SIZE];
1802         struct mm_struct *mm = current->mm;
1803         int error;
1804
1805         BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1806         BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
1807
1808         if (opt == PR_SET_MM_MAP_SIZE)
1809                 return put_user((unsigned int)sizeof(prctl_map),
1810                                 (unsigned int __user *)addr);
1811
1812         if (data_size != sizeof(prctl_map))
1813                 return -EINVAL;
1814
1815         if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1816                 return -EFAULT;
1817
1818         error = validate_prctl_map(&prctl_map);
1819         if (error)
1820                 return error;
1821
1822         if (prctl_map.auxv_size) {
1823                 memset(user_auxv, 0, sizeof(user_auxv));
1824                 if (copy_from_user(user_auxv,
1825                                    (const void __user *)prctl_map.auxv,
1826                                    prctl_map.auxv_size))
1827                         return -EFAULT;
1828
1829                 /* Last entry must be AT_NULL as specification requires */
1830                 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
1831                 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1832         }
1833
1834         down_write(&mm->mmap_sem);
1835         if (prctl_map.exe_fd != (u32)-1)
1836                 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
1837         downgrade_write(&mm->mmap_sem);
1838         if (error)
1839                 goto out;
1840
1841         /*
1842          * We don't validate if these members are pointing to
1843          * real present VMAs because application may have correspond
1844          * VMAs already unmapped and kernel uses these members for statistics
1845          * output in procfs mostly, except
1846          *
1847          *  - @start_brk/@brk which are used in do_brk but kernel lookups
1848          *    for VMAs when updating these memvers so anything wrong written
1849          *    here cause kernel to swear at userspace program but won't lead
1850          *    to any problem in kernel itself
1851          */
1852
1853         mm->start_code  = prctl_map.start_code;
1854         mm->end_code    = prctl_map.end_code;
1855         mm->start_data  = prctl_map.start_data;
1856         mm->end_data    = prctl_map.end_data;
1857         mm->start_brk   = prctl_map.start_brk;
1858         mm->brk         = prctl_map.brk;
1859         mm->start_stack = prctl_map.start_stack;
1860         mm->arg_start   = prctl_map.arg_start;
1861         mm->arg_end     = prctl_map.arg_end;
1862         mm->env_start   = prctl_map.env_start;
1863         mm->env_end     = prctl_map.env_end;
1864
1865         /*
1866          * Note this update of @saved_auxv is lockless thus
1867          * if someone reads this member in procfs while we're
1868          * updating -- it may get partly updated results. It's
1869          * known and acceptable trade off: we leave it as is to
1870          * not introduce additional locks here making the kernel
1871          * more complex.
1872          */
1873         if (prctl_map.auxv_size)
1874                 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
1875
1876         error = 0;
1877 out:
1878         up_read(&mm->mmap_sem);
1879         return error;
1880 }
1881 #endif /* CONFIG_CHECKPOINT_RESTORE */
1882
1883 static int prctl_set_mm(int opt, unsigned long addr,
1884                         unsigned long arg4, unsigned long arg5)
1885 {
1886         struct mm_struct *mm = current->mm;
1887         struct vm_area_struct *vma;
1888         int error;
1889
1890         if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
1891                               opt != PR_SET_MM_MAP &&
1892                               opt != PR_SET_MM_MAP_SIZE)))
1893                 return -EINVAL;
1894
1895 #ifdef CONFIG_CHECKPOINT_RESTORE
1896         if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
1897                 return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
1898 #endif
1899
1900         if (!capable(CAP_SYS_RESOURCE))
1901                 return -EPERM;
1902
1903         if (opt == PR_SET_MM_EXE_FILE) {
1904                 down_write(&mm->mmap_sem);
1905                 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
1906                 up_write(&mm->mmap_sem);
1907                 return error;
1908         }
1909
1910         if (addr >= TASK_SIZE || addr < mmap_min_addr)
1911                 return -EINVAL;
1912
1913         error = -EINVAL;
1914
1915         down_read(&mm->mmap_sem);
1916         vma = find_vma(mm, addr);
1917
1918         switch (opt) {
1919         case PR_SET_MM_START_CODE:
1920                 mm->start_code = addr;
1921                 break;
1922         case PR_SET_MM_END_CODE:
1923                 mm->end_code = addr;
1924                 break;
1925         case PR_SET_MM_START_DATA:
1926                 mm->start_data = addr;
1927                 break;
1928         case PR_SET_MM_END_DATA:
1929                 mm->end_data = addr;
1930                 break;
1931
1932         case PR_SET_MM_START_BRK:
1933                 if (addr <= mm->end_data)
1934                         goto out;
1935
1936                 if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
1937                                       mm->end_data, mm->start_data))
1938                         goto out;
1939
1940                 mm->start_brk = addr;
1941                 break;
1942
1943         case PR_SET_MM_BRK:
1944                 if (addr <= mm->end_data)
1945                         goto out;
1946
1947                 if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
1948                                       mm->end_data, mm->start_data))
1949                         goto out;
1950
1951                 mm->brk = addr;
1952                 break;
1953
1954         /*
1955          * If command line arguments and environment
1956          * are placed somewhere else on stack, we can
1957          * set them up here, ARG_START/END to setup
1958          * command line argumets and ENV_START/END
1959          * for environment.
1960          */
1961         case PR_SET_MM_START_STACK:
1962         case PR_SET_MM_ARG_START:
1963         case PR_SET_MM_ARG_END:
1964         case PR_SET_MM_ENV_START:
1965         case PR_SET_MM_ENV_END:
1966                 if (!vma) {
1967                         error = -EFAULT;
1968                         goto out;
1969                 }
1970                 if (opt == PR_SET_MM_START_STACK)
1971                         mm->start_stack = addr;
1972                 else if (opt == PR_SET_MM_ARG_START)
1973                         mm->arg_start = addr;
1974                 else if (opt == PR_SET_MM_ARG_END)
1975                         mm->arg_end = addr;
1976                 else if (opt == PR_SET_MM_ENV_START)
1977                         mm->env_start = addr;
1978                 else if (opt == PR_SET_MM_ENV_END)
1979                         mm->env_end = addr;
1980                 break;
1981
1982         /*
1983          * This doesn't move auxiliary vector itself
1984          * since it's pinned to mm_struct, but allow
1985          * to fill vector with new values. It's up
1986          * to a caller to provide sane values here
1987          * otherwise user space tools which use this
1988          * vector might be unhappy.
1989          */
1990         case PR_SET_MM_AUXV: {
1991                 unsigned long user_auxv[AT_VECTOR_SIZE];
1992
1993                 if (arg4 > sizeof(user_auxv))
1994                         goto out;
1995                 up_read(&mm->mmap_sem);
1996
1997                 if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
1998                         return -EFAULT;
1999
2000                 /* Make sure the last entry is always AT_NULL */
2001                 user_auxv[AT_VECTOR_SIZE - 2] = 0;
2002                 user_auxv[AT_VECTOR_SIZE - 1] = 0;
2003
2004                 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
2005
2006                 task_lock(current);
2007                 memcpy(mm->saved_auxv, user_auxv, arg4);
2008                 task_unlock(current);
2009
2010                 return 0;
2011         }
2012         default:
2013                 goto out;
2014         }
2015
2016         error = 0;
2017 out:
2018         up_read(&mm->mmap_sem);
2019         return error;
2020 }
2021
2022 #ifdef CONFIG_CHECKPOINT_RESTORE
2023 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2024 {
2025         return put_user(me->clear_child_tid, tid_addr);
2026 }
2027 #else
2028 static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2029 {
2030         return -EINVAL;
2031 }
2032 #endif
2033
2034 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2035                 unsigned long, arg4, unsigned long, arg5)
2036 {
2037         struct task_struct *me = current;
2038         unsigned char comm[sizeof(me->comm)];
2039         long error;
2040
2041         error = security_task_prctl(option, arg2, arg3, arg4, arg5);
2042         if (error != -ENOSYS)
2043                 return error;
2044
2045         error = 0;
2046         switch (option) {
2047         case PR_SET_PDEATHSIG:
2048                 if (!valid_signal(arg2)) {
2049                         error = -EINVAL;
2050                         break;
2051                 }
2052                 me->pdeath_signal = arg2;
2053                 break;
2054         case PR_GET_PDEATHSIG:
2055                 error = put_user(me->pdeath_signal, (int __user *)arg2);
2056                 break;
2057         case PR_GET_DUMPABLE:
2058                 error = get_dumpable(me->mm);
2059                 break;
2060         case PR_SET_DUMPABLE:
2061                 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
2062                         error = -EINVAL;
2063                         break;
2064                 }
2065                 set_dumpable(me->mm, arg2);
2066                 break;
2067
2068         case PR_SET_UNALIGN:
2069                 error = SET_UNALIGN_CTL(me, arg2);
2070                 break;
2071         case PR_GET_UNALIGN:
2072                 error = GET_UNALIGN_CTL(me, arg2);
2073                 break;
2074         case PR_SET_FPEMU:
2075                 error = SET_FPEMU_CTL(me, arg2);
2076                 break;
2077         case PR_GET_FPEMU:
2078                 error = GET_FPEMU_CTL(me, arg2);
2079                 break;
2080         case PR_SET_FPEXC:
2081                 error = SET_FPEXC_CTL(me, arg2);
2082                 break;
2083         case PR_GET_FPEXC:
2084                 error = GET_FPEXC_CTL(me, arg2);
2085                 break;
2086         case PR_GET_TIMING:
2087                 error = PR_TIMING_STATISTICAL;
2088                 break;
2089         case PR_SET_TIMING:
2090                 if (arg2 != PR_TIMING_STATISTICAL)
2091                         error = -EINVAL;
2092                 break;
2093         case PR_SET_NAME:
2094                 comm[sizeof(me->comm) - 1] = 0;
2095                 if (strncpy_from_user(comm, (char __user *)arg2,
2096                                       sizeof(me->comm) - 1) < 0)
2097                         return -EFAULT;
2098                 set_task_comm(me, comm);
2099                 proc_comm_connector(me);
2100                 break;
2101         case PR_GET_NAME:
2102                 get_task_comm(comm, me);
2103                 if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
2104                         return -EFAULT;
2105                 break;
2106         case PR_GET_ENDIAN:
2107                 error = GET_ENDIAN(me, arg2);
2108                 break;
2109         case PR_SET_ENDIAN:
2110                 error = SET_ENDIAN(me, arg2);
2111                 break;
2112         case PR_GET_SECCOMP:
2113                 error = prctl_get_seccomp();
2114                 break;
2115         case PR_SET_SECCOMP:
2116                 error = prctl_set_seccomp(arg2, (char __user *)arg3);
2117                 break;
2118         case PR_GET_TSC:
2119                 error = GET_TSC_CTL(arg2);
2120                 break;
2121         case PR_SET_TSC:
2122                 error = SET_TSC_CTL(arg2);
2123                 break;
2124         case PR_TASK_PERF_EVENTS_DISABLE:
2125                 error = perf_event_task_disable();
2126                 break;
2127         case PR_TASK_PERF_EVENTS_ENABLE:
2128                 error = perf_event_task_enable();
2129                 break;
2130         case PR_GET_TIMERSLACK:
2131                 error = current->timer_slack_ns;
2132                 break;
2133         case PR_SET_TIMERSLACK:
2134                 if (arg2 <= 0)
2135                         current->timer_slack_ns =
2136                                         current->default_timer_slack_ns;
2137                 else
2138                         current->timer_slack_ns = arg2;
2139                 break;
2140         case PR_MCE_KILL:
2141                 if (arg4 | arg5)
2142                         return -EINVAL;
2143                 switch (arg2) {
2144                 case PR_MCE_KILL_CLEAR:
2145                         if (arg3 != 0)
2146                                 return -EINVAL;
2147                         current->flags &= ~PF_MCE_PROCESS;
2148                         break;
2149                 case PR_MCE_KILL_SET:
2150                         current->flags |= PF_MCE_PROCESS;
2151                         if (arg3 == PR_MCE_KILL_EARLY)
2152                                 current->flags |= PF_MCE_EARLY;
2153                         else if (arg3 == PR_MCE_KILL_LATE)
2154                                 current->flags &= ~PF_MCE_EARLY;
2155                         else if (arg3 == PR_MCE_KILL_DEFAULT)
2156                                 current->flags &=
2157                                                 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2158                         else
2159                                 return -EINVAL;
2160                         break;
2161                 default:
2162                         return -EINVAL;
2163                 }
2164                 break;
2165         case PR_MCE_KILL_GET:
2166                 if (arg2 | arg3 | arg4 | arg5)
2167                         return -EINVAL;
2168                 if (current->flags & PF_MCE_PROCESS)
2169                         error = (current->flags & PF_MCE_EARLY) ?
2170                                 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2171                 else
2172                         error = PR_MCE_KILL_DEFAULT;
2173                 break;
2174         case PR_SET_MM:
2175                 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2176                 break;
2177         case PR_GET_TID_ADDRESS:
2178                 error = prctl_get_tid_address(me, (int __user **)arg2);
2179                 break;
2180         case PR_SET_CHILD_SUBREAPER:
2181                 me->signal->is_child_subreaper = !!arg2;
2182                 break;
2183         case PR_GET_CHILD_SUBREAPER:
2184                 error = put_user(me->signal->is_child_subreaper,
2185                                  (int __user *)arg2);
2186                 break;
2187         case PR_SET_NO_NEW_PRIVS:
2188                 if (arg2 != 1 || arg3 || arg4 || arg5)
2189                         return -EINVAL;
2190
2191                 task_set_no_new_privs(current);
2192                 break;
2193         case PR_GET_NO_NEW_PRIVS:
2194                 if (arg2 || arg3 || arg4 || arg5)
2195                         return -EINVAL;
2196                 return task_no_new_privs(current) ? 1 : 0;
2197         case PR_GET_THP_DISABLE:
2198                 if (arg2 || arg3 || arg4 || arg5)
2199                         return -EINVAL;
2200                 error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
2201                 break;
2202         case PR_SET_THP_DISABLE:
2203                 if (arg3 || arg4 || arg5)
2204                         return -EINVAL;
2205                 down_write(&me->mm->mmap_sem);
2206                 if (arg2)
2207                         me->mm->def_flags |= VM_NOHUGEPAGE;
2208                 else
2209                         me->mm->def_flags &= ~VM_NOHUGEPAGE;
2210                 up_write(&me->mm->mmap_sem);
2211                 break;
2212         case PR_MPX_ENABLE_MANAGEMENT:
2213                 if (arg2 || arg3 || arg4 || arg5)
2214                         return -EINVAL;
2215                 error = MPX_ENABLE_MANAGEMENT(me);
2216                 break;
2217         case PR_MPX_DISABLE_MANAGEMENT:
2218                 if (arg2 || arg3 || arg4 || arg5)
2219                         return -EINVAL;
2220                 error = MPX_DISABLE_MANAGEMENT(me);
2221                 break;
2222         default:
2223                 error = -EINVAL;
2224                 break;
2225         }
2226         return error;
2227 }
2228
2229 SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2230                 struct getcpu_cache __user *, unused)
2231 {
2232         int err = 0;
2233         int cpu = raw_smp_processor_id();
2234
2235         if (cpup)
2236                 err |= put_user(cpu, cpup);
2237         if (nodep)
2238                 err |= put_user(cpu_to_node(cpu), nodep);
2239         return err ? -EFAULT : 0;
2240 }
2241
2242 /**
2243  * do_sysinfo - fill in sysinfo struct
2244  * @info: pointer to buffer to fill
2245  */
2246 static int do_sysinfo(struct sysinfo *info)
2247 {
2248         unsigned long mem_total, sav_total;
2249         unsigned int mem_unit, bitcount;
2250         struct timespec tp;
2251
2252         memset(info, 0, sizeof(struct sysinfo));
2253
2254         get_monotonic_boottime(&tp);
2255         info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2256
2257         get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
2258
2259         info->procs = nr_threads;
2260
2261         si_meminfo(info);
2262         si_swapinfo(info);
2263
2264         /*
2265          * If the sum of all the available memory (i.e. ram + swap)
2266          * is less than can be stored in a 32 bit unsigned long then
2267          * we can be binary compatible with 2.2.x kernels.  If not,
2268          * well, in that case 2.2.x was broken anyways...
2269          *
2270          *  -Erik Andersen <andersee@debian.org>
2271          */
2272
2273         mem_total = info->totalram + info->totalswap;
2274         if (mem_total < info->totalram || mem_total < info->totalswap)
2275                 goto out;
2276         bitcount = 0;
2277         mem_unit = info->mem_unit;
2278         while (mem_unit > 1) {
2279                 bitcount++;
2280                 mem_unit >>= 1;
2281                 sav_total = mem_total;
2282                 mem_total <<= 1;
2283                 if (mem_total < sav_total)
2284                         goto out;
2285         }
2286
2287         /*
2288          * If mem_total did not overflow, multiply all memory values by
2289          * info->mem_unit and set it to 1.  This leaves things compatible
2290          * with 2.2.x, and also retains compatibility with earlier 2.4.x
2291          * kernels...
2292          */
2293
2294         info->mem_unit = 1;
2295         info->totalram <<= bitcount;
2296         info->freeram <<= bitcount;
2297         info->sharedram <<= bitcount;
2298         info->bufferram <<= bitcount;
2299         info->totalswap <<= bitcount;
2300         info->freeswap <<= bitcount;
2301         info->totalhigh <<= bitcount;
2302         info->freehigh <<= bitcount;
2303
2304 out:
2305         return 0;
2306 }
2307
2308 SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
2309 {
2310         struct sysinfo val;
2311
2312         do_sysinfo(&val);
2313
2314         if (copy_to_user(info, &val, sizeof(struct sysinfo)))
2315                 return -EFAULT;
2316
2317         return 0;
2318 }
2319
2320 #ifdef CONFIG_COMPAT
2321 struct compat_sysinfo {
2322         s32 uptime;
2323         u32 loads[3];
2324         u32 totalram;
2325         u32 freeram;
2326         u32 sharedram;
2327         u32 bufferram;
2328         u32 totalswap;
2329         u32 freeswap;
2330         u16 procs;
2331         u16 pad;
2332         u32 totalhigh;
2333         u32 freehigh;
2334         u32 mem_unit;
2335         char _f[20-2*sizeof(u32)-sizeof(int)];
2336 };
2337
2338 COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2339 {
2340         struct sysinfo s;
2341
2342         do_sysinfo(&s);
2343
2344         /* Check to see if any memory value is too large for 32-bit and scale
2345          *  down if needed
2346          */
2347         if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
2348                 int bitcount = 0;
2349
2350                 while (s.mem_unit < PAGE_SIZE) {
2351                         s.mem_unit <<= 1;
2352                         bitcount++;
2353                 }
2354
2355                 s.totalram >>= bitcount;
2356                 s.freeram >>= bitcount;
2357                 s.sharedram >>= bitcount;
2358                 s.bufferram >>= bitcount;
2359                 s.totalswap >>= bitcount;
2360                 s.freeswap >>= bitcount;
2361                 s.totalhigh >>= bitcount;
2362                 s.freehigh >>= bitcount;
2363         }
2364
2365         if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
2366             __put_user(s.uptime, &info->uptime) ||
2367             __put_user(s.loads[0], &info->loads[0]) ||
2368             __put_user(s.loads[1], &info->loads[1]) ||
2369             __put_user(s.loads[2], &info->loads[2]) ||
2370             __put_user(s.totalram, &info->totalram) ||
2371             __put_user(s.freeram, &info->freeram) ||
2372             __put_user(s.sharedram, &info->sharedram) ||
2373             __put_user(s.bufferram, &info->bufferram) ||
2374             __put_user(s.totalswap, &info->totalswap) ||
2375             __put_user(s.freeswap, &info->freeswap) ||
2376             __put_user(s.procs, &info->procs) ||
2377             __put_user(s.totalhigh, &info->totalhigh) ||
2378             __put_user(s.freehigh, &info->freehigh) ||
2379             __put_user(s.mem_unit, &info->mem_unit))
2380                 return -EFAULT;
2381
2382         return 0;
2383 }
2384 #endif /* CONFIG_COMPAT */