diff options
Diffstat (limited to 'libdde-linux26/lib/src/kernel')
-rw-r--r-- | libdde-linux26/lib/src/kernel/capability.c | 323 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/cred-internals.h | 21 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/exit.c | 1850 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/irq/handle.c | 23 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/resource.c | 936 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/sched.c | 9654 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/sched_cpupri.h | 37 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/sys.c | 1893 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/time.c | 765 | ||||
-rwxr-xr-x | libdde-linux26/lib/src/kernel/timeconst.pl | 378 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/timer.c | 1590 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/wait.c | 301 | ||||
-rw-r--r-- | libdde-linux26/lib/src/kernel/workqueue.c | 1038 |
13 files changed, 18809 insertions, 0 deletions
diff --git a/libdde-linux26/lib/src/kernel/capability.c b/libdde-linux26/lib/src/kernel/capability.c new file mode 100644 index 00000000..c269aa7c --- /dev/null +++ b/libdde-linux26/lib/src/kernel/capability.c @@ -0,0 +1,323 @@ +/* + * linux/kernel/capability.c + * + * Copyright (C) 1997 Andrew Main <zefram@fysh.org> + * + * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> + * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> + */ + +#include <linux/audit.h> +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/pid_namespace.h> +#include <asm/uaccess.h> +#include "cred-internals.h" + +#ifndef DDE_LINUX +/* + * This lock protects task->cap_* for all tasks including current. + * Locking rule: acquire this prior to tasklist_lock. + */ +static DEFINE_SPINLOCK(task_capability_lock); + +/* + * Leveraged for setting/resetting capabilities + */ + +const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; +const kernel_cap_t __cap_full_set = CAP_FULL_SET; +const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; + +EXPORT_SYMBOL(__cap_empty_set); +EXPORT_SYMBOL(__cap_full_set); +EXPORT_SYMBOL(__cap_init_eff_set); + +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +int file_caps_enabled = 1; + +static int __init file_caps_disable(char *str) +{ + file_caps_enabled = 0; + return 1; +} +__setup("no_file_caps", file_caps_disable); +#endif + +/* + * More recent versions of libcap are available from: + * + * http://www.kernel.org/pub/linux/libs/security/linux-privs/ + */ + +static void warn_legacy_capability_use(void) +{ + static int warned; + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" + " (legacy support in use)\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. + */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + +/* + * The only thing that can change the capabilities of the current + * process is the current process. As such, we can't be in this code + * at the same time as we are in the process of setting capabilities + * in this process. The net result is that we can limit our use of + * locks to when we are reading the caps of another process. + */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + int ret; + + if (pid && (pid != task_pid_vnr(current))) { + struct task_struct *target; + + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else + ret = security_capget(target, pEp, pIp, pPp); + + read_unlock(&tasklist_lock); + } else + ret = security_capget(current, pEp, pIp, pPp); + + return ret; +} + +/** + * sys_capget - get the capabilities of a given process. + * @header: pointer to struct that contains capability version and + * target pid data + * @dataptr: pointer to struct that contains the effective, permitted, + * and inheritable capabilities that are returned + * + * Returns 0 on success and < 0 on error. + */ +SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) +{ + int ret = 0; + pid_t pid; + unsigned tocopy; + kernel_cap_t pE, pI, pP; + + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; + + if (get_user(pid, &header->pid)) + return -EFAULT; + + if (pid < 0) + return -EINVAL; + + ret = cap_get_target_pid(pid, &pE, &pI, &pP); + if (!ret) { + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; + unsigned i; + + for (i = 0; i < tocopy; i++) { + kdata[i].effective = pE.cap[i]; + kdata[i].permitted = pP.cap[i]; + kdata[i].inheritable = pI.cap[i]; + } + + /* + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, + * we silently drop the upper capabilities here. This + * has the effect of making older libcap + * implementations implicitly drop upper capability + * bits when they perform a: capget/modify/capset + * sequence. + * + * This behavior is considered fail-safe + * behavior. Upgrading the application to a newer + * version of libcap will enable access to the newer + * capabilities. + * + * An alternative would be to return an error here + * (-ERANGE), but that causes legacy applications to + * unexpectidly fail; the capget/modify/capset aborts + * before modification is attempted and the application + * fails. + */ + if (copy_to_user(dataptr, kdata, tocopy + * sizeof(struct __user_cap_data_struct))) { + return -EFAULT; + } + } + + return ret; +} + +/** + * sys_capset - set capabilities for a process or (*) a group of processes + * @header: pointer to struct that contains capability version and + * target pid data + * @data: pointer to struct that contains the effective, permitted, + * and inheritable capabilities + * + * Set capabilities for the current process only. The ability to any other + * process(es) has been deprecated and removed. + * + * The restrictions on setting capabilities are specified as: + * + * I: any raised capabilities must be a subset of the old permitted + * P: any raised capabilities must be a subset of the old permitted + * E: must be set to a subset of new permitted + * + * Returns 0 on success and < 0 on error. + */ +SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) +{ + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; + unsigned i, tocopy; + kernel_cap_t inheritable, permitted, effective; + struct cred *new; + int ret; + pid_t pid; + + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; + + if (get_user(pid, &header->pid)) + return -EFAULT; + + /* may only affect current now */ + if (pid != 0 && pid != task_pid_vnr(current)) + return -EPERM; + + if (copy_from_user(&kdata, data, + tocopy * sizeof(struct __user_cap_data_struct))) + return -EFAULT; + + for (i = 0; i < tocopy; i++) { + effective.cap[i] = kdata[i].effective; + permitted.cap[i] = kdata[i].permitted; + inheritable.cap[i] = kdata[i].inheritable; + } + while (i < _KERNEL_CAPABILITY_U32S) { + effective.cap[i] = 0; + permitted.cap[i] = 0; + inheritable.cap[i] = 0; + i++; + } + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = security_capset(new, current_cred(), + &effective, &inheritable, &permitted); + if (ret < 0) + goto error; + + audit_log_capset(pid, new, current_cred()); + + return commit_creds(new); + +error: + abort_creds(new); + return ret; +} +#endif /* !DDE_LINUX */ + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +int capable(int cap) +{ + if (unlikely(!cap_valid(cap))) { + printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + BUG(); + } + + if (security_capable(cap) == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} +EXPORT_SYMBOL(capable); diff --git a/libdde-linux26/lib/src/kernel/cred-internals.h b/libdde-linux26/lib/src/kernel/cred-internals.h new file mode 100644 index 00000000..2dc4fc2d --- /dev/null +++ b/libdde-linux26/lib/src/kernel/cred-internals.h @@ -0,0 +1,21 @@ +/* Internal credentials stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +/* + * user.c + */ +static inline void sched_switch_user(struct task_struct *p) +{ +#ifdef CONFIG_USER_SCHED + sched_move_task(p); +#endif /* CONFIG_USER_SCHED */ +} + diff --git a/libdde-linux26/lib/src/kernel/exit.c b/libdde-linux26/lib/src/kernel/exit.c new file mode 100644 index 00000000..703f9aab --- /dev/null +++ b/libdde-linux26/lib/src/kernel/exit.c @@ -0,0 +1,1850 @@ +/* + * linux/kernel/exit.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/personality.h> +#include <linux/tty.h> +#include <linux/mnt_namespace.h> +#include <linux/iocontext.h> +#include <linux/key.h> +#include <linux/security.h> +#include <linux/cpu.h> +#include <linux/acct.h> +#include <linux/tsacct_kern.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/binfmts.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> +#include <linux/ptrace.h> +#include <linux/profile.h> +#include <linux/mount.h> +#include <linux/proc_fs.h> +#include <linux/kthread.h> +#include <linux/mempolicy.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> +#include <linux/freezer.h> +#include <linux/cgroup.h> +#include <linux/syscalls.h> +#include <linux/signal.h> +#include <linux/posix-timers.h> +#include <linux/cn_proc.h> +#include <linux/mutex.h> +#include <linux/futex.h> +#include <linux/pipe_fs_i.h> +#include <linux/audit.h> /* for audit_free() */ +#include <linux/resource.h> +#include <linux/blkdev.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/tracehook.h> +#include <linux/init_task.h> +#include <trace/sched.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/pgtable.h> +#include <asm/mmu_context.h> +#include "cred-internals.h" + +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); + +#ifndef DDE_LINUX +static void exit_mm(struct task_struct * tsk); + +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + +static void __unhash_process(struct task_struct *p) +{ + nr_threads--; + detach_pid(p, PIDTYPE_PID); + if (thread_group_leader(p)) { + detach_pid(p, PIDTYPE_PGID); + detach_pid(p, PIDTYPE_SID); + + list_del_rcu(&p->tasks); + __get_cpu_var(process_counts)--; + } + list_del_rcu(&p->thread_group); + list_del_init(&p->sibling); +} + +/* + * This function expects the tasklist_lock write-locked. + */ +static void __exit_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *sighand; + + BUG_ON(!sig); + BUG_ON(!atomic_read(&sig->count)); + + sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); + + posix_cpu_timers_exit(tsk); + if (atomic_dec_and_test(&sig->count)) + posix_cpu_timers_exit_group(tsk); + else { + /* + * If there is any task waiting for the group exit + * then notify it: + */ + if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) + wake_up_process(sig->group_exit_task); + + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + /* + * Accumulate here the counters for all threads but the + * group leader as they die, so they can be added into + * the process-wide totals when those are taken. + * The group leader stays around as a zombie as long + * as there are other threads. When it gets reaped, + * the exit.c code will add its counts into these totals. + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. + */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + sig = NULL; /* Marker for below. */ + } + + __unhash_process(tsk); + + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + + tsk->signal = NULL; + tsk->sighand = NULL; + spin_unlock(&sighand->siglock); + + __cleanup_sighand(sighand); + clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + if (sig) { + flush_sigqueue(&sig->shared_pending); + taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); + __cleanup_signal(sig); + } +} + +static void delayed_put_task_struct(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + trace_sched_process_free(tsk); + put_task_struct(tsk); +} + + +void release_task(struct task_struct * p) +{ + struct task_struct *leader; + int zap_leader; +repeat: + tracehook_prepare_release_task(p); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ + atomic_dec(&__task_cred(p)->user->processes); + + proc_flush_task(p); + write_lock_irq(&tasklist_lock); + tracehook_finish_release_task(p); + __exit_signal(p); + + /* + * If we are the last non-leader member of the thread + * group, and the leader is zombie, then notify the + * group leader's parent process. (if it wants notification.) + */ + zap_leader = 0; + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + BUG_ON(task_detached(leader)); + do_notify_parent(leader, leader->exit_signal); + /* + * If we were the last child thread and the leader has + * exited already, and the leader's parent ignores SIGCHLD, + * then we are the one who should release the leader. + * + * do_notify_parent() will have marked it self-reaping in + * that case. + */ + zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; + } + + write_unlock_irq(&tasklist_lock); + release_thread(p); + call_rcu(&p->rcu, delayed_put_task_struct); + + p = leader; + if (unlikely(zap_leader)) + goto repeat; +} + +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + * + * The caller must hold rcu lock or the tasklist lock. + */ +struct pid *session_of_pgrp(struct pid *pgrp) +{ + struct task_struct *p; + struct pid *sid = NULL; + + p = pid_task(pgrp, PIDTYPE_PGID); + if (p == NULL) + p = pid_task(pgrp, PIDTYPE_PID); + if (p != NULL) + sid = task_session(p); + + return sid; +} + +/* + * Determine if a process group is "orphaned", according to the POSIX + * definition in 2.2.2.52. Orphaned process groups are not to be affected + * by terminal-generated stop signals. Newly orphaned process groups are + * to receive a SIGHUP and a SIGCONT. + * + * "I ask you, have you ever known what it is to be an orphan?" + */ +static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +{ + struct task_struct *p; + + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + if ((p == ignored_task) || + (p->exit_state && thread_group_empty(p)) || + is_global_init(p->real_parent)) + continue; + + if (task_pgrp(p->real_parent) != pgrp && + task_session(p->real_parent) == task_session(p)) + return 0; + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + + return 1; +} + +int is_current_pgrp_orphaned(void) +{ + int retval; + + read_lock(&tasklist_lock); + retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); + read_unlock(&tasklist_lock); + + return retval; +} + +static int has_stopped_jobs(struct pid *pgrp) +{ + int retval = 0; + struct task_struct *p; + + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + if (!task_is_stopped(p)) + continue; + retval = 1; + break; + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + return retval; +} + +/* + * Check to see if any process groups have become orphaned as + * a result of our exiting, and if they have any stopped jobs, + * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void +kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) +{ + struct pid *pgrp = task_pgrp(tsk); + struct task_struct *ignored_task = tsk; + + if (!parent) + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ + parent = tsk->real_parent; + else + /* reparent: our child is in a different pgrp than + * we are, and it was the only connection outside. + */ + ignored_task = NULL; + + if (task_pgrp(parent) != pgrp && + task_session(parent) == task_session(tsk) && + will_become_orphaned_pgrp(pgrp, ignored_task) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); + } +} + +/** + * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to kthreadd so it + * isn't in the way of other processes and is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_kthreadd() gives the caller full capabilities. + */ +static void reparent_to_kthreadd(void) +{ + write_lock_irq(&tasklist_lock); + + ptrace_unlink(current); + /* Reparent to init */ + current->real_parent = current->parent = kthreadd_task; + list_move_tail(¤t->sibling, ¤t->real_parent->children); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + if (task_nice(current) < 0) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + memcpy(current->signal->rlim, init_task.signal->rlim, + sizeof(current->signal->rlim)); + +#ifndef DDE_LINUX + atomic_inc(&init_cred.usage); + commit_creds(&init_cred); +#endif + write_unlock_irq(&tasklist_lock); +} + +void __set_special_pids(struct pid *pid) +{ + struct task_struct *curr = current->group_leader; + pid_t nr = pid_nr(pid); + + if (task_session(curr) != pid) { + change_pid(curr, PIDTYPE_SID, pid); + set_task_session(curr, nr); + } + if (task_pgrp(curr) != pid) { + change_pid(curr, PIDTYPE_PGID, pid); + set_task_pgrp(curr, nr); + } +} + +static void set_special_pids(struct pid *pid) +{ + write_lock_irq(&tasklist_lock); + __set_special_pids(pid); + write_unlock_irq(&tasklist_lock); +} + +/* + * Let kernel threads use this to say that they + * allow a certain signal (since daemonize() will + * have disabled all of them by default). + */ +int allow_signal(int sig) +{ + if (!valid_signal(sig) || sig < 1) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + sigdelset(¤t->blocked, sig); + if (!current->mm) { + /* Kernel threads handle their own signals. + Let the signal code know it'll be handled, so + that they don't get converted to SIGKILL or + just silently dropped */ + current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; + } + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(allow_signal); + +int disallow_signal(int sig) +{ + if (!valid_signal(sig) || sig < 1) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(disallow_signal); + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(const char *name, ...) +{ + va_list args; + struct fs_struct *fs; + sigset_t blocked; + + va_start(args, name); + vsnprintf(current->comm, sizeof(current->comm), name, args); + va_end(args); + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + /* + * We don't want to have TIF_FREEZE set if the system-wide hibernation + * or suspend transition begins right now. + */ + current->flags |= (PF_NOFREEZE | PF_KTHREAD); + + if (current->nsproxy != &init_nsproxy) { + get_nsproxy(&init_nsproxy); + switch_task_namespaces(current, &init_nsproxy); + } + set_special_pids(&init_struct_pid); + proc_clear_tty(current); + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); + + reparent_to_kthreadd(); +} + +EXPORT_SYMBOL(daemonize); + +static void close_files(struct files_struct * files) +{ + int i, j; + struct fdtable *fdt; + + j = 0; + + /* + * It is safe to dereference the fd table without RCU or + * ->file_lock because this is the last reference to the + * files structure. + */ + fdt = files_fdtable(files); + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= fdt->max_fds) + break; + set = fdt->open_fds->fds_bits[j++]; + while (set) { + if (set & 1) { + struct file * file = xchg(&fdt->fd[i], NULL); + if (file) { + filp_close(file, files); + cond_resched(); + } + } + i++; + set >>= 1; + } + } +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ + struct files_struct *files; + + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + + return files; +} + +void put_files_struct(struct files_struct *files) +{ + struct fdtable *fdt; + + if (atomic_dec_and_test(&files->count)) { + close_files(files); + /* + * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. + */ + fdt = files_fdtable(files); + if (fdt != &files->fdtab) + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); + } +} + +void reset_files_struct(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} + +void exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); + put_files_struct(files); + } +} + +void put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + put_fs_struct(fs); + } +} + +EXPORT_SYMBOL_GPL(exit_fs); + +#ifdef CONFIG_MM_OWNER +/* + * Task p is exiting and it owned mm, lets find a new owner for it + */ +static inline int +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) +{ + /* + * If there are other users of the mm and the owner (us) is exiting + * we need to find a new owner to take on the responsibility. + */ + if (atomic_read(&mm->mm_users) <= 1) + return 0; + if (mm->owner != p) + return 0; + return 1; +} + +void mm_update_next_owner(struct mm_struct *mm) +{ + struct task_struct *c, *g, *p = current; + +retry: + if (!mm_need_new_owner(mm, p)) + return; + + read_lock(&tasklist_lock); + /* + * Search in the children + */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search in the siblings + */ + list_for_each_entry(c, &p->parent->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search through everything else. We should not get + * here often + */ + do_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + } while_each_thread(g, c); + + read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL. + */ + mm->owner = NULL; + return; + +assign_new_owner: + BUG_ON(c == p); + get_task_struct(c); + /* + * The task_lock protects c->mm from changing. + * We always want mm->owner->mm == mm + */ + task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); + if (c->mm != mm) { + task_unlock(c); + put_task_struct(c); + goto retry; + } + mm->owner = c; + task_unlock(c); + put_task_struct(c); +} +#endif /* CONFIG_MM_OWNER */ + +/* + * Turn us into a lazy TLB process if we + * aren't already.. + */ +static void exit_mm(struct task_struct * tsk) +{ + struct mm_struct *mm = tsk->mm; + struct core_state *core_state; + + mm_release(tsk, mm); + if (!mm) + return; + /* + * Serialize with any possible pending coredump. + * We must hold mmap_sem around checking core_state + * and clearing tsk->mm. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group with ->mm != NULL. + */ + down_read(&mm->mmap_sem); + core_state = mm->core_state; + if (core_state) { + struct core_thread self; + up_read(&mm->mmap_sem); + + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); + down_read(&mm->mmap_sem); + } + atomic_inc(&mm->mm_count); + BUG_ON(mm != tsk->active_mm); + /* more a memory barrier than a real lock */ + task_lock(tsk); + tsk->mm = NULL; + up_read(&mm->mmap_sem); + enter_lazy_tlb(mm, current); + /* We don't want this task to be frozen prematurely */ + clear_freeze_flag(tsk); + task_unlock(tsk); + mm_update_next_owner(mm); + mmput(mm); +} + +/* + * Return nonzero if @parent's children should reap themselves. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static int ignoring_children(struct task_struct *parent) +{ + int ret; + struct sighand_struct *psig = parent->sighand; + unsigned long flags; + spin_lock_irqsave(&psig->siglock, flags); + ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); + spin_unlock_irqrestore(&psig->siglock, flags); + return ret; +} + +/* + * Detach all tasks we were using ptrace on. + * Any that need to be release_task'd are put on the @dead list. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_exit(struct task_struct *parent, struct list_head *dead) +{ + struct task_struct *p, *n; + int ign = -1; + + list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { + __ptrace_unlink(p); + + if (p->exit_state != EXIT_ZOMBIE) + continue; + + /* + * If it's a zombie, our attachedness prevented normal + * parent notification or self-reaping. Do notification + * now if it would have happened earlier. If it should + * reap itself, add it to the @dead list. We can't call + * release_task() here because we already hold tasklist_lock. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, parent)) + do_notify_parent(p, p->exit_signal); + else { + if (ign < 0) + ign = ignoring_children(parent); + if (ign) + p->exit_signal = -1; + } + } + + if (task_detached(p)) { + /* + * Mark it as in the process of being reaped. + */ + p->exit_state = EXIT_DEAD; + list_add(&p->ptrace_entry, dead); + } + } +} + +/* + * Finish up exit-time ptrace cleanup. + * + * Called without locks. + */ +static void ptrace_exit_finish(struct task_struct *parent, + struct list_head *dead) +{ + struct task_struct *p, *n; + + BUG_ON(!list_empty(&parent->ptraced)); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + +static void reparent_thread(struct task_struct *p, struct task_struct *father) +{ + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); + + /* If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (same_thread_group(p->real_parent, father)) + return; + + /* We don't want people slaying init. */ + if (!task_detached(p)) + p->exit_signal = SIGCHLD; + + /* If we'd notified the old parent about this child's death, + * also notify the new parent. + */ + if (!ptrace_reparented(p) && + p->exit_state == EXIT_ZOMBIE && + !task_detached(p) && thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + + kill_orphaned_pgrp(p, father); +} + +/* + * When we die, we re-parent all our children. + * Try to give them to another thread in our thread + * group, and if no such member exists, give it to + * the child reaper process (ie "init") in our pid + * space. + */ +static struct task_struct *find_new_reaper(struct task_struct *father) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + +static void forget_original_parent(struct task_struct *father) +{ + struct task_struct *p, *n, *reaper; + LIST_HEAD(ptrace_dead); + + write_lock_irq(&tasklist_lock); + reaper = find_new_reaper(father); + /* + * First clean up ptrace if we were using it. + */ + ptrace_exit(father, &ptrace_dead); + + list_for_each_entry_safe(p, n, &father->children, sibling) { + p->real_parent = reaper; + if (p->parent == father) { + BUG_ON(p->ptrace); + p->parent = p->real_parent; + } + reparent_thread(p, father); + } + + write_unlock_irq(&tasklist_lock); + BUG_ON(!list_empty(&father->children)); + + ptrace_exit_finish(father, &ptrace_dead); +} + +/* + * Send signals to all our closest relatives so that they know + * to properly mourn us.. + */ +static void exit_notify(struct task_struct *tsk, int group_dead) +{ + int signal; + void *cookie; + + /* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ + forget_original_parent(tsk); + exit_task_namespaces(tsk); + + write_lock_irq(&tasklist_lock); + if (group_dead) + kill_orphaned_pgrp(tsk->group_leader, NULL); + + /* Let father know we died + * + * Thread signals are configurable, but you aren't going to use + * that to send signals to arbitary processes. + * That stops right now. + * + * If the parent exec id doesn't match the exec id we saved + * when we started then we know the parent has changed security + * domain. + * + * If our self_exec id doesn't match our parent_exec_id then + * we have changed execution domain as these two values started + * the same after a fork. + */ + if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && + (tsk->parent_exec_id != tsk->real_parent->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && + !capable(CAP_KILL)) + tsk->exit_signal = SIGCHLD; + + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal >= 0) + signal = do_notify_parent(tsk, signal); + + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + + /* mt-exec, de_thread() is waiting for us */ + if (thread_group_leader(tsk) && + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) + wake_up_process(tsk->signal->group_exit_task); + + write_unlock_irq(&tasklist_lock); + + tracehook_report_death(tsk, signal, cookie, group_dead); + + /* If the process is dead, release it - nobody will wait for it */ + if (signal == DEATH_REAP) + release_task(tsk); +} + +#ifdef CONFIG_DEBUG_STACK_USAGE +static void check_stack_usage(void) +{ + static DEFINE_SPINLOCK(low_water_lock); + static int lowest_to_date = THREAD_SIZE; + unsigned long *n = end_of_stack(current); + unsigned long free; + + while (*n == 0) + n++; + free = (unsigned long)n - (unsigned long)end_of_stack(current); + + if (free >= lowest_to_date) + return; + + spin_lock(&low_water_lock); + if (free < lowest_to_date) { + printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " + "left\n", + current->comm, free); + lowest_to_date = free; + } + spin_unlock(&low_water_lock); +} +#else +static inline void check_stack_usage(void) {} +#endif + +NORET_TYPE void do_exit(long code) +{ + struct task_struct *tsk = current; + int group_dead; + + profile_task_exit(tsk); + + WARN_ON(atomic_read(&tsk->fs_excl)); + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + tracehook_report_exit(&code); + + /* + * We're taking recursive faults here in do_exit. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + printk(KERN_ALERT + "Fixing recursive fault but reboot is needed!\n"); + /* + * We can do this unlocked here. The futex code uses + * this flag just to verify whether the pi state + * cleanup has been done or not. In the worst case it + * loops once more. We pretend that the cleanup was + * done as there is no way to return. Either the + * OWNER_DIED bit is set by now or we push the blocked + * task into the wait for ever nirwana as well. + */ + tsk->flags |= PF_EXITPIDONE; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } + + exit_signals(tsk); /* sets PF_EXITING */ + /* + * tsk->flags are checked in the futex code to protect against + * an exiting task cleaning up the robust pi futexes. + */ + smp_mb(); + spin_unlock_wait(&tsk->pi_lock); + + if (unlikely(in_atomic())) + printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + + acct_update_integrals(tsk); + + group_dead = atomic_dec_and_test(&tsk->signal->live); + if (group_dead) { + hrtimer_cancel(&tsk->signal->real_timer); + exit_itimers(tsk->signal); + } + acct_collect(code, group_dead); + if (group_dead) + tty_audit_exit(); + if (unlikely(tsk->audit_context)) + audit_free(tsk); + + tsk->exit_code = code; + taskstats_exit(tsk, group_dead); + + exit_mm(tsk); + + if (group_dead) + acct_process(); + trace_sched_process_exit(tsk); + + exit_sem(tsk); + exit_files(tsk); + exit_fs(tsk); + check_stack_usage(); + exit_thread(); + cgroup_exit(tsk, 1); + + if (group_dead && tsk->signal->leader) + disassociate_ctty(1); + + module_put(task_thread_info(tsk)->exec_domain->module); + if (tsk->binfmt) + module_put(tsk->binfmt->module); + + proc_exit_connector(tsk); + exit_notify(tsk, group_dead); +#ifdef CONFIG_NUMA + mpol_put(tsk->mempolicy); + tsk->mempolicy = NULL; +#endif +#ifdef CONFIG_FUTEX + /* + * This must happen late, after the PID is not + * hashed anymore: + */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) + kfree(current->pi_state_cache); +#endif + /* + * Make sure we are holding no locks: + */ + debug_check_no_locks_held(tsk); + /* + * We can do this unlocked here. The futex code uses this flag + * just to verify whether the pi state cleanup has been done + * or not. In the worst case it loops once more. + */ + tsk->flags |= PF_EXITPIDONE; + + if (tsk->io_context) + exit_io_context(); + + if (tsk->splice_pipe) + __free_pipe_info(tsk->splice_pipe); + + preempt_disable(); + /* causes final put_task_struct in finish_task_switch(). */ + tsk->state = TASK_DEAD; + schedule(); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) + cpu_relax(); /* For when BUG is null */ +} + +EXPORT_SYMBOL_GPL(do_exit); + +#endif /* !DDE_LINUX */ + +NORET_TYPE void complete_and_exit(struct completion *comp, long code) +{ + if (comp) + complete(comp); + + do_exit(code); +} + +EXPORT_SYMBOL(complete_and_exit); + +#ifndef DDE_LINUX +SYSCALL_DEFINE1(exit, int, error_code) +{ + do_exit((error_code&0xff)<<8); +} + +/* + * Take down every thread in the group. This is called by fatal signals + * as well as by sys_exit_group (below). + */ +NORET_TYPE void +do_group_exit(int exit_code) +{ + struct signal_struct *sig = current->signal; + + BUG_ON(exit_code & 0x80); /* core dumps don't get here */ + + if (signal_group_exit(sig)) + exit_code = sig->group_exit_code; + else if (!thread_group_empty(current)) { + struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); + if (signal_group_exit(sig)) + /* Another thread got here before we took the lock. */ + exit_code = sig->group_exit_code; + else { + sig->group_exit_code = exit_code; + sig->flags = SIGNAL_GROUP_EXIT; + zap_other_threads(current); + } + spin_unlock_irq(&sighand->siglock); + } + + do_exit(exit_code); + /* NOTREACHED */ +} + +/* + * this kills every thread in the thread group. Note that any externally + * wait4()-ing process will get the correct exit code - even if this + * thread is not the thread group leader. + */ +SYSCALL_DEFINE1(exit_group, int, error_code) +{ + do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; +} + +static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + struct pid *pid = NULL; + if (type == PIDTYPE_PID) + pid = task->pids[type].pid; + else if (type < PIDTYPE_MAX) + pid = task->group_leader->pids[type].pid; + return pid; +} + +static int eligible_child(enum pid_type type, struct pid *pid, int options, + struct task_struct *p) +{ + int err; + + if (type < PIDTYPE_MAX) { + if (task_pid_type(p, type) != pid) + return 0; + } + + /* Wait for all children (clone and not) if __WALL is set; + * otherwise, wait for clone children *only* if __WCLONE is + * set; otherwise, wait for non-clone children *only*. (Note: + * A "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD.) */ + if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) + && !(options & __WALL)) + return 0; + + err = security_task_wait(p); + if (err) + return err; + + return 1; +} + +static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, + int why, int status, + struct siginfo __user *infop, + struct rusage __user *rusagep) +{ + int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; + + put_task_struct(p); + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + if (!retval) + retval = pid; + return retval; +} + +/* + * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_zombie(struct task_struct *p, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + unsigned long state; + int retval, status, traced; + pid_t pid = task_pid_vnr(p); + uid_t uid = __task_cred(p)->uid; + + if (!likely(options & WEXITED)) + return 0; + + if (unlikely(options & WNOWAIT)) { + int exit_code = p->exit_code; + int why, status; + + get_task_struct(p); + read_unlock(&tasklist_lock); + if ((exit_code & 0x7f) == 0) { + why = CLD_EXITED; + status = exit_code >> 8; + } else { + why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; + status = exit_code & 0x7f; + } + return wait_noreap_copyout(p, pid, uid, why, + status, infop, ru); + } + + /* + * Try to move the task's state to DEAD + * only one thread is allowed to do this: + */ + state = xchg(&p->exit_state, EXIT_DEAD); + if (state != EXIT_ZOMBIE) { + BUG_ON(state != EXIT_DEAD); + return 0; + } + + traced = ptrace_reparented(p); + + if (likely(!traced)) { + struct signal_struct *psig; + struct signal_struct *sig; + struct task_cputime cputime; + + /* + * The resource counters for the group leader are in its + * own task_struct. Those for dead threads in the group + * are in its signal_struct, as are those for the child + * processes it has previously reaped. All these + * accumulate in the parent's signal_struct c* fields. + * + * We don't bother to take a lock here to protect these + * p->signal fields, because they are only touched by + * __exit_signal, which runs with tasklist_lock + * write-locked anyway, and so is excluded here. We do + * need to protect the access to p->parent->signal fields, + * as other threads in the parent group can be right + * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. + */ + thread_group_cputime(p, &cputime); + spin_lock_irq(&p->parent->sighand->siglock); + psig = p->parent->signal; + sig = p->signal; + psig->cutime = + cputime_add(psig->cutime, + cputime_add(cputime.utime, + sig->cutime)); + psig->cstime = + cputime_add(psig->cstime, + cputime_add(cputime.stime, + sig->cstime)); + psig->cgtime = + cputime_add(psig->cgtime, + cputime_add(p->gtime, + cputime_add(sig->gtime, + sig->cgtime))); + psig->cmin_flt += + p->min_flt + sig->min_flt + sig->cmin_flt; + psig->cmaj_flt += + p->maj_flt + sig->maj_flt + sig->cmaj_flt; + psig->cnvcsw += + p->nvcsw + sig->nvcsw + sig->cnvcsw; + psig->cnivcsw += + p->nivcsw + sig->nivcsw + sig->cnivcsw; + psig->cinblock += + task_io_get_inblock(p) + + sig->inblock + sig->cinblock; + psig->coublock += + task_io_get_oublock(p) + + sig->oublock + sig->coublock; + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); + spin_unlock_irq(&p->parent->sighand->siglock); + } + + /* + * Now we are sure this task is interesting, and no other + * thread can reap it because we set its state to EXIT_DEAD. + */ + read_unlock(&tasklist_lock); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; + if (!retval && stat_addr) + retval = put_user(status, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) { + int why; + + if ((status & 0x7f) == 0) { + why = CLD_EXITED; + status >>= 8; + } else { + why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + status &= 0x7f; + } + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(status, &infop->si_status); + } + if (!retval && infop) + retval = put_user(pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = pid; + + if (traced) { + write_lock_irq(&tasklist_lock); + /* We dropped tasklist, ptracer could die and untrace */ + ptrace_unlink(p); + /* + * If this is not a detached task, notify the parent. + * If it's still not detached after that, don't release + * it now. + */ + if (!task_detached(p)) { + do_notify_parent(p, p->exit_signal); + if (!task_detached(p)) { + p->exit_state = EXIT_ZOMBIE; + p = NULL; + } + } + write_unlock_irq(&tasklist_lock); + } + if (p != NULL) + release_task(p); + + return retval; +} + +/* + * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_stopped(int ptrace, struct task_struct *p, + int options, struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval, exit_code, why; + uid_t uid = 0; /* unneeded, required by compiler */ + pid_t pid; + + if (!(options & WUNTRACED)) + return 0; + + exit_code = 0; + spin_lock_irq(&p->sighand->siglock); + + if (unlikely(!task_is_stopped_or_traced(p))) + goto unlock_sig; + + if (!ptrace && p->signal->group_stop_count > 0) + /* + * A group stop is in progress and this is the group leader. + * We won't report until all threads have stopped. + */ + goto unlock_sig; + + exit_code = p->exit_code; + if (!exit_code) + goto unlock_sig; + + if (!unlikely(options & WNOWAIT)) + p->exit_code = 0; + + /* don't need the RCU readlock here as we're holding a spinlock */ + uid = __task_cred(p)->uid; +unlock_sig: + spin_unlock_irq(&p->sighand->siglock); + if (!exit_code) + return 0; + + /* + * Now we are pretty sure this task is interesting. + * Make sure it doesn't get reaped out from under us while we + * give up the lock and then examine it below. We don't want to + * keep holding onto the tasklist_lock while we call getrusage and + * possibly take page faults for user memory. + */ + get_task_struct(p); + pid = task_pid_vnr(p); + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; + read_unlock(&tasklist_lock); + + if (unlikely(options & WNOWAIT)) + return wait_noreap_copyout(p, pid, uid, + why, exit_code, + infop, ru); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + if (!retval && stat_addr) + retval = put_user((exit_code << 8) | 0x7f, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) + retval = put_user((short)why, &infop->si_code); + if (!retval && infop) + retval = put_user(exit_code, &infop->si_status); + if (!retval && infop) + retval = put_user(pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = pid; + put_task_struct(p); + + BUG_ON(!retval); + return retval; +} + +/* + * Handle do_wait work for one task in a live, non-stopped state. + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_continued(struct task_struct *p, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval; + pid_t pid; + uid_t uid; + + if (!unlikely(options & WCONTINUED)) + return 0; + + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) + return 0; + + spin_lock_irq(&p->sighand->siglock); + /* Re-check with the lock held. */ + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { + spin_unlock_irq(&p->sighand->siglock); + return 0; + } + if (!unlikely(options & WNOWAIT)) + p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = __task_cred(p)->uid; + spin_unlock_irq(&p->sighand->siglock); + + pid = task_pid_vnr(p); + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!infop) { + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + put_task_struct(p); + if (!retval && stat_addr) + retval = put_user(0xffff, stat_addr); + if (!retval) + retval = pid; + } else { + retval = wait_noreap_copyout(p, pid, uid, + CLD_CONTINUED, SIGCONT, + infop, ru); + BUG_ON(retval == 0); + } + + return retval; +} + +/* + * Consider @p for a wait by @parent. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; + * then *@notask_error is 0 if @p is an eligible child, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int wait_consider_task(struct task_struct *parent, int ptrace, + struct task_struct *p, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int ret = eligible_child(type, pid, options, p); + if (!ret) + return ret; + + if (unlikely(ret < 0)) { + /* + * If we have not yet seen any eligible child, + * then let this error code replace -ECHILD. + * A permission error will give the user a clue + * to look for security policy problems, rather + * than for mysterious wait bugs. + */ + if (*notask_error) + *notask_error = ret; + } + + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * This child is hidden by ptrace. + * We aren't allowed to see it now, but eventually we will. + */ + *notask_error = 0; + return 0; + } + + if (p->exit_state == EXIT_DEAD) + return 0; + + /* + * We don't reap group leaders with subthreads. + */ + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) + return wait_task_zombie(p, options, infop, stat_addr, ru); + + /* + * It's stopped or running now, so it might + * later continue, exit, or stop again. + */ + *notask_error = 0; + + if (task_is_stopped_or_traced(p)) + return wait_task_stopped(ptrace, p, options, + infop, stat_addr, ru); + + return wait_task_continued(p, options, infop, stat_addr, ru); +} + +/* + * Do the work of do_wait() for one thread in the group, @tsk. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; then + * *@notask_error is 0 if there were any eligible children, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int do_wait_thread(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + list_for_each_entry(p, &tsk->children, sibling) { + /* + * Do not consider detached threads. + */ + if (!task_detached(p)) { + int ret = wait_consider_task(tsk, 0, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + } + + return 0; +} + +static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + /* + * Traditionally we see ptrace'd stopped tasks regardless of options. + */ + options |= WUNTRACED; + + list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { + int ret = wait_consider_task(tsk, 1, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + + return 0; +} + +static long do_wait(enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + DECLARE_WAITQUEUE(wait, current); + struct task_struct *tsk; + int retval; + + trace_sched_process_wait(pid); + + add_wait_queue(¤t->signal->wait_chldexit,&wait); +repeat: + /* + * If there is nothing that can match our critiera just get out. + * We will clear @retval to zero if we see any child that might later + * match our criteria, even if we are not able to reap it yet. + */ + retval = -ECHILD; + if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) + goto end; + + current->state = TASK_INTERRUPTIBLE; + read_lock(&tasklist_lock); + tsk = current; + do { + int tsk_result = do_wait_thread(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (!tsk_result) + tsk_result = ptrace_do_wait(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (tsk_result) { + /* + * tasklist_lock is unlocked and we have a final result. + */ + retval = tsk_result; + goto end; + } + + if (options & __WNOTHREAD) + break; + tsk = next_thread(tsk); + BUG_ON(tsk->signal != current->signal); + } while (tsk != current); + read_unlock(&tasklist_lock); + + if (!retval && !(options & WNOHANG)) { + retval = -ERESTARTSYS; + if (!signal_pending(current)) { + schedule(); + goto repeat; + } + } + +end: + current->state = TASK_RUNNING; + remove_wait_queue(¤t->signal->wait_chldexit,&wait); + if (infop) { + if (retval > 0) + retval = 0; + else { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!retval) + retval = put_user(0, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user(0, &infop->si_code); + if (!retval) + retval = put_user(0, &infop->si_pid); + if (!retval) + retval = put_user(0, &infop->si_uid); + if (!retval) + retval = put_user(0, &infop->si_status); + } + } + return retval; +} + +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) +{ + struct pid *pid = NULL; + enum pid_type type; + long ret; + + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + return -EINVAL; + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) + return -EINVAL; + + switch (which) { + case P_ALL: + type = PIDTYPE_MAX; + break; + case P_PID: + type = PIDTYPE_PID; + if (upid <= 0) + return -EINVAL; + break; + case P_PGID: + type = PIDTYPE_PGID; + if (upid <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (type < PIDTYPE_MAX) + pid = find_get_pid(upid); + ret = do_wait(type, pid, options, infop, NULL, ru); + put_pid(pid); + + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(5, ret, which, upid, infop, options, ru); + return ret; +} + +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) +{ + struct pid *pid = NULL; + enum pid_type type; + long ret; + + if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) + return -EINVAL; + + if (upid == -1) + type = PIDTYPE_MAX; + else if (upid < 0) { + type = PIDTYPE_PGID; + pid = find_get_pid(-upid); + } else if (upid == 0) { + type = PIDTYPE_PGID; + pid = get_pid(task_pgrp(current)); + } else /* upid > 0 */ { + type = PIDTYPE_PID; + pid = find_get_pid(upid); + } + + ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); + put_pid(pid); + + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); + return ret; +} + +#ifdef __ARCH_WANT_SYS_WAITPID + +/* + * sys_waitpid() remains for compatibility. waitpid() should be + * implemented by calling sys_wait4() from libc.a. + */ +SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) +{ + return sys_wait4(pid, stat_addr, options, NULL); +} + +#endif +#endif diff --git a/libdde-linux26/lib/src/kernel/irq/handle.c b/libdde-linux26/lib/src/kernel/irq/handle.c new file mode 100644 index 00000000..ac7b14f8 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/irq/handle.c @@ -0,0 +1,23 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code. + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/rculist.h> +#include <linux/hash.h> + +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + diff --git a/libdde-linux26/lib/src/kernel/resource.c b/libdde-linux26/lib/src/kernel/resource.c new file mode 100644 index 00000000..3dd07a35 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/resource.c @@ -0,0 +1,936 @@ +/* + * linux/kernel/resource.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 1999 Martin Mares <mj@ucw.cz> + * + * Arbitrary resource management. + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/device.h> +#include <linux/pfn.h> +#include <asm/io.h> + + +struct resource ioport_resource = { + .name = "PCI IO", + .start = 0, + .end = IO_SPACE_LIMIT, + .flags = IORESOURCE_IO, +}; +EXPORT_SYMBOL(ioport_resource); + +struct resource iomem_resource = { + .name = "PCI mem", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, +}; +EXPORT_SYMBOL(iomem_resource); + +static DEFINE_RWLOCK(resource_lock); + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + if (p->child) + return p->child; + while (!p->sibling && p->parent) + p = p->parent; + return p->sibling; +} + +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + +static void *r_start(struct seq_file *m, loff_t *pos) + __acquires(resource_lock) +{ + struct resource *p = m->private; + loff_t l = 0; + read_lock(&resource_lock); + for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) + ; + return p; +} + +static void r_stop(struct seq_file *m, void *v) + __releases(resource_lock) +{ + read_unlock(&resource_lock); +} + +static int r_show(struct seq_file *m, void *v) +{ + struct resource *root = m->private; + struct resource *r = v, *p; + int width = root->end < 0x10000 ? 4 : 8; + int depth; + + for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) + if (p->parent == root) + break; + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", + depth * 2, "", + width, (unsigned long long) r->start, + width, (unsigned long long) r->end, + r->name ? r->name : "<BAD>"); + return 0; +} + +static const struct seq_operations resource_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = r_show, +}; + +static int ioports_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &ioport_resource; + } + return res; +} + +static int iomem_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &iomem_resource; + } + return res; +} + +static const struct file_operations proc_ioports_operations = { + .open = ioports_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations proc_iomem_operations = { + .open = iomem_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init ioresources_init(void) +{ + proc_create("ioports", 0, NULL, &proc_ioports_operations); + proc_create("iomem", 0, NULL, &proc_iomem_operations); + return 0; +} +__initcall(ioresources_init); + +#endif /* CONFIG_PROC_FS */ + +/* Return the conflict entry if you can't request it */ +static struct resource * __request_resource(struct resource *root, struct resource *new) +{ + resource_size_t start = new->start; + resource_size_t end = new->end; + struct resource *tmp, **p; + + if (end < start) + return root; + if (start < root->start) + return root; + if (end > root->end) + return root; + p = &root->child; + for (;;) { + tmp = *p; + if (!tmp || tmp->start > end) { + new->sibling = tmp; + *p = new; + new->parent = root; + return NULL; + } + p = &tmp->sibling; + if (tmp->end < start) + continue; + return tmp; + } +} + +static int __release_resource(struct resource *old) +{ + struct resource *tmp, **p; + + p = &old->parent->child; + for (;;) { + tmp = *p; + if (!tmp) + break; + if (tmp == old) { + *p = tmp->sibling; + old->parent = NULL; + return 0; + } + p = &tmp->sibling; + } + return -EINVAL; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __request_resource(root, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +EXPORT_SYMBOL(request_resource); + +/** + * release_resource - release a previously reserved resource + * @old: resource pointer + */ +int release_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old); + write_unlock(&resource_lock); + return retval; +} + +EXPORT_SYMBOL(release_resource); + +#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +/* + * Finds the lowest memory reosurce exists within [res->start.res->end) + * the caller must specify res->start, res->end, res->flags. + * If found, returns 0, res is overwritten, if not found, returns -1. + */ +static int find_next_system_ram(struct resource *res) +{ + resource_size_t start, end; + struct resource *p; + + BUG_ON(!res); + + start = res->start; + end = res->end; + BUG_ON(start >= end); + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + /* system ram is just marked as IORESOURCE_MEM */ + if (p->flags != res->flags) + continue; + if (p->start > end) { + p = NULL; + break; + } + if ((p->end >= start) && (p->start < end)) + break; + } + read_unlock(&resource_lock); + if (!p) + return -1; + /* copy data */ + if (res->start < p->start) + res->start = p->start; + if (res->end > p->end) + res->end = p->end; + return 0; +} +int +walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, + int (*func)(unsigned long, unsigned long, void *)) +{ + struct resource res; + unsigned long pfn, len; + u64 orig_end; + int ret = -1; + res.start = (u64) start_pfn << PAGE_SHIFT; + res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { + pfn = (unsigned long)(res.start >> PAGE_SHIFT); + len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); + ret = (*func)(pfn, len, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#endif + +/* + * Find empty slot in the resource tree given range and alignment. + */ +static int find_resource(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + void (*alignf)(void *, struct resource *, + resource_size_t, resource_size_t), + void *alignf_data) +{ + struct resource *this = root->child; + + new->start = root->start; + /* + * Skip past an allocated resource that starts at 0, since the assignment + * of this->start - 1 to new->end below would cause an underflow. + */ + if (this && this->start == 0) { + new->start = this->end + 1; + this = this->sibling; + } + for(;;) { + if (this) + new->end = this->start - 1; + else + new->end = root->end; + if (new->start < min) + new->start = min; + if (new->end > max) + new->end = max; + new->start = ALIGN(new->start, align); + if (alignf) + alignf(alignf_data, new, size, align); + if (new->start < new->end && new->end - new->start >= size - 1) { + new->end = new->start + size - 1; + return 0; + } + if (!this) + break; + new->start = this->end + 1; + this = this->sibling; + } + return -EBUSY; +} + +/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * @size: requested resource region size + * @min: minimum size to allocate + * @max: maximum size to allocate + * @align: alignment requested, in bytes + * @alignf: alignment function, optional, called if not NULL + * @alignf_data: arbitrary data to pass to the @alignf function + */ +int allocate_resource(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + void (*alignf)(void *, struct resource *, + resource_size_t, resource_size_t), + void *alignf_data) +{ + int err; + + write_lock(&resource_lock); + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + if (err >= 0 && __request_resource(root, new)) + err = -EBUSY; + write_unlock(&resource_lock); + return err; +} + +EXPORT_SYMBOL(allocate_resource); + +/* + * Insert a resource into the resource tree. If successful, return NULL, + * otherwise return the conflicting resource (compare to __request_resource()) + */ +static struct resource * __insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *first, *next; + + for (;; parent = first) { + first = __request_resource(parent, new); + if (!first) + return first; + + if (first == parent) + return first; + + if ((first->start > new->start) || (first->end < new->end)) + break; + if ((first->start == new->start) && (first->end == new->end)) + break; + } + + for (next = first; ; next = next->sibling) { + /* Partial overlap? Bad, and unfixable */ + if (next->start < new->start || next->end > new->end) + return next; + if (!next->sibling) + break; + if (next->sibling->start > new->end) + break; + } + + new->parent = parent; + new->sibling = next->sibling; + new->child = first; + + next->sibling = NULL; + for (next = first; next; next = next->sibling) + next->parent = new; + + if (parent->child == first) { + parent->child = new; + } else { + next = parent->child; + while (next->sibling != first) + next = next->sibling; + next->sibling = new; + } + return NULL; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent to request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become children of + * the new resource. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __insert_resource(parent, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +/** + * insert_resource_expand_to_fit - Insert a resource into the resource tree + * @root: root resource descriptor + * @new: new resource to insert + * + * Insert a resource into the resource tree, possibly expanding it in order + * to make it encompass any conflicting resources. + */ +void insert_resource_expand_to_fit(struct resource *root, struct resource *new) +{ + if (new->parent) + return; + + write_lock(&resource_lock); + for (;;) { + struct resource *conflict; + + conflict = __insert_resource(root, new); + if (!conflict) + break; + if (conflict == root) + break; + + /* Ok, expand resource to cover the conflict, then try again .. */ + if (conflict->start < new->start) + new->start = conflict->start; + if (conflict->end > new->end) + new->end = conflict->end; + + printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + } + write_unlock(&resource_lock); +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +{ + struct resource *tmp, *parent = res->parent; + resource_size_t end = start + size - 1; + int result = -EBUSY; + + write_lock(&resource_lock); + + if ((start < parent->start) || (end > parent->end)) + goto out; + + for (tmp = res->child; tmp; tmp = tmp->sibling) { + if ((tmp->start < start) || (tmp->end > end)) + goto out; + } + + if (res->sibling && (res->sibling->start <= end)) + goto out; + + tmp = parent->child; + if (tmp != res) { + while (tmp->sibling != res) + tmp = tmp->sibling; + if (start <= tmp->end) + goto out; + } + + res->start = start; + res->end = end; + result = 0; + + out: + write_unlock(&resource_lock); + return result; +} + +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + + if (!res) { + /* failed, split and try again */ + + /* conflict covered whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void __init reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + +EXPORT_SYMBOL(adjust_resource); + +/** + * resource_alignment - calculate resource's alignment + * @res: resource pointer + * + * Returns alignment on success, 0 (invalid alignment) on failure. + */ +resource_size_t resource_alignment(struct resource *res) +{ + switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { + case IORESOURCE_SIZEALIGN: + return resource_size(res); + case IORESOURCE_STARTALIGN: + return res->start; + default: + return 0; + } +} + +/* + * This is compatibility stuff for IO resources. + * + * Note how this, unlike the above, knows about + * the IO flag meanings (busy etc). + * + * request_region creates a new busy region. + * + * check_region returns non-zero if the area is already busy. + * + * release_region releases a matching busy region. + */ + +#ifndef DDE_LINUX +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string + * @flags: IO resource flags + */ +struct resource * __request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name, int flags) +{ + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return NULL; + + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; + res->flags |= flags; + + write_lock(&resource_lock); + + for (;;) { + struct resource *conflict; + + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + write_unlock(&resource_lock); + return res; +} +EXPORT_SYMBOL(__request_region); + +/** + * __check_region - check if a resource region is busy or free + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * Returns 0 if the region is free at the moment it is checked, + * returns %-EBUSY if the region is busy. + * + * NOTE: + * This function is deprecated because its use is racy. + * Even if it returns 0, a subsequent call to request_region() + * may fail because another driver etc. just allocated the region. + * Do NOT use it. It will be removed from the kernel. + */ +int __check_region(struct resource *parent, resource_size_t start, + resource_size_t n) +{ + struct resource * res; + + res = __request_region(parent, start, n, "check-region", 0); + if (!res) + return -EBUSY; + + release_resource(res); + kfree(res); + return 0; +} +EXPORT_SYMBOL(__check_region); + +/** + * __release_region - release a previously reserved resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * The described resource region must match a currently busy region. + */ +void __release_region(struct resource *parent, resource_size_t start, + resource_size_t n) +{ + struct resource **p; + resource_size_t end; + + p = &parent->child; + end = start + n - 1; + + write_lock(&resource_lock); + + for (;;) { + struct resource *res = *p; + + if (!res) + break; + if (res->start <= start && res->end >= end) { + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + if (res->start != start || res->end != end) + break; + *p = res->sibling; + write_unlock(&resource_lock); + kfree(res); + return; + } + p = &res->sibling; + } + + write_unlock(&resource_lock); + + printk(KERN_WARNING "Trying to free nonexistent resource " + "<%016llx-%016llx>\n", (unsigned long long)start, + (unsigned long long)end); +} +EXPORT_SYMBOL(__release_region); +#endif /* DDE_LINUX */ + +/* + * Managed region resource + */ +struct region_devres { + struct resource *parent; + resource_size_t start; + resource_size_t n; +}; + +static void devm_region_release(struct device *dev, void *res) +{ + struct region_devres *this = res; + + __release_region(this->parent, this->start, this->n); +} + +static int devm_region_match(struct device *dev, void *res, void *match_data) +{ + struct region_devres *this = res, *match = match_data; + + return this->parent == match->parent && + this->start == match->start && this->n == match->n; +} + +struct resource * __devm_request_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n, const char *name) +{ + struct region_devres *dr = NULL; + struct resource *res; + + dr = devres_alloc(devm_region_release, sizeof(struct region_devres), + GFP_KERNEL); + if (!dr) + return NULL; + + dr->parent = parent; + dr->start = start; + dr->n = n; + + res = __request_region(parent, start, n, name, 0); + if (res) + devres_add(dev, dr); + else + devres_free(dr); + + return res; +} +EXPORT_SYMBOL(__devm_request_region); + +void __devm_release_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n) +{ + struct region_devres match_data = { parent, start, n }; + + __release_region(parent, start, n); + WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + &match_data)); +} +EXPORT_SYMBOL(__devm_release_region); + +/* + * Called from init/main.c to reserve IO ports. + */ +#define MAXRESERVE 4 +static int __init reserve_setup(char *str) +{ + static int reserved; + static struct resource reserve[MAXRESERVE]; + + for (;;) { + int io_start, io_num; + int x = reserved; + + if (get_option (&str, &io_start) != 2) + break; + if (get_option (&str, &io_num) == 0) + break; + if (x < MAXRESERVE) { + struct resource *res = reserve + x; + res->name = "reserved"; + res->start = io_start; + res->end = io_start + io_num - 1; + res->flags = IORESOURCE_BUSY; + res->child = NULL; + if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) + reserved = x+1; + } + } + return 1; +} + +__setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && + PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) + continue; + /* + * if a resource is "BUSY", it's not a hardware resource + * but a driver mapping of such a resource; we don't want + * to warn for those; some drivers legitimately map only + * partial hardware resources. (example: vesafb) + */ + if (p->flags & IORESOURCE_BUSY) + continue; + + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} + +#ifdef CONFIG_STRICT_DEVMEM +static int strict_iomem_checks = 1; +#else +static int strict_iomem_checks; +#endif + +/* + * check if an address is reserved in the iomem resource tree + * returns 1 if reserved, 0 if not reserved. + */ +int iomem_is_exclusive(u64 addr) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + int size = PAGE_SIZE; + + if (!strict_iomem_checks) + return 0; + + addr = addr & PAGE_MASK; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + break; + if (p->end < addr) + continue; + if (p->flags & IORESOURCE_BUSY && + p->flags & IORESOURCE_EXCLUSIVE) { + err = 1; + break; + } + } + read_unlock(&resource_lock); + + return err; +} + +static int __init strict_iomem(char *str) +{ + if (strstr(str, "relaxed")) + strict_iomem_checks = 0; + if (strstr(str, "strict")) + strict_iomem_checks = 1; + return 1; +} + +__setup("iomem=", strict_iomem); diff --git a/libdde-linux26/lib/src/kernel/sched.c b/libdde-linux26/lib/src/kernel/sched.c new file mode 100644 index 00000000..5c51695e --- /dev/null +++ b/libdde-linux26/lib/src/kernel/sched.c @@ -0,0 +1,9654 @@ +/* + * kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/debug_locks.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/profile.h> +#include <linux/freezer.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/pid_namespace.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/percpu.h> +#include <linux/kthread.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sysctl.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/tsacct_kern.h> +#include <linux/kprobes.h> +#include <linux/delayacct.h> +#include <linux/reciprocal_div.h> +#include <linux/unistd.h> +#include <linux/pagemap.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> +#include <linux/ftrace.h> +#include <trace/sched.h> + +#include <asm/tlb.h> +#include <asm/irq_regs.h> + +#include "sched_cpupri.h" + +#ifdef DDE_LINUX +/* DDE_LINUX implements this function externally */ +extern int try_to_wake_up(struct task_struct *p, unsigned int state, int sync); +#endif + +/** DDE only uses small parts of this. */ +#ifndef DDE_LINUX +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define DEF_TIMESLICE (100 * HZ / 1000) + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +DEFINE_TRACE(sched_wait_task); +DEFINE_TRACE(sched_wakeup); +DEFINE_TRACE(sched_wakeup_new); +DEFINE_TRACE(sched_switch); +DEFINE_TRACE(sched_migrate_task); + +#ifdef CONFIG_SMP + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif + +static inline int rt_policy(int policy) +{ + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; +} + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +static DEFINE_MUTEX(sched_domains_mutex); + +#ifdef CONFIG_GROUP_SCHED + +#include <linux/cgroup.h> + +struct cfs_rq; + +static LIST_HEAD(task_groups); + +/* task group related information */ +struct task_group { +#ifdef CONFIG_CGROUP_SCHED + struct cgroup_subsys_state css; +#endif + +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; +}; + +#ifdef CONFIG_USER_SCHED + +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. + */ +struct task_group root_task_group; + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +#endif /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ +#define root_task_group init_task_group +#endif /* CONFIG_USER_SCHED */ + +/* task_group_lock serializes add/remove of task groups and also changes to + * a task group's cpu shares. + */ +static DEFINE_SPINLOCK(task_group_lock); + +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_USER_SCHED +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) +#else /* !CONFIG_USER_SCHED */ +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +#endif /* CONFIG_USER_SCHED */ + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES 2 +#define MAX_SHARES (1UL << 18) + +static int init_task_group_load = INIT_TASK_GROUP_LOAD; +#endif + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +struct task_group init_task_group; + +/* return group to which a task belongs */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + +#ifdef CONFIG_USER_SCHED + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); +#elif defined(CONFIG_CGROUP_SCHED) + tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), + struct task_group, css); +#else + tg = &init_task_group; +#endif + return tg; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_GROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running; + + u64 exec_clock; + u64 min_vruntime; + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last; + + unsigned int nr_spread_over; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + + /* + * this cpu's part of tg->shares + */ + unsigned long shares; + + /* + * load.weight at the time we set shares + */ + unsigned long rq_weight; +#endif +#endif +}; + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + int highest_prio; /* highest queued rt task prio */ +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + int overloaded; +#endif + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; + struct sched_rt_entity *rt_se; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif +}; + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +#endif + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; + unsigned char in_nohz_recently; +#endif + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED + struct list_head leaf_rt_rq_list; +#endif + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; + + u64 clock; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + /* cpu of this runqueue: */ + int cpu; + int online; + + unsigned long avg_load_per_task; + + struct task_struct *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ + unsigned int yld_exp_empty; + unsigned int yld_act_empty; + unsigned int yld_both_empty; + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; + + /* BKL stats */ + unsigned int bkl_count; +#endif +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +{ + rq->curr->sched_class->check_preempt_curr(rq, p, sync); +} + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +static inline void update_rq_clock(struct rq *rq) +{ + rq->clock = sched_clock_cpu(cpu_of(rq)); +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + +/* + * Debugging: various feature bits + */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "sched_features.h" +}; + +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + +const_debug unsigned int sysctl_sched_features = +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static __read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; sched_feat_names[i]; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * ratelimit for updating the group shares. + * default: 0.25ms + */ +unsigned int sysctl_sched_shares_ratelimit = 250000; + +/* + * Inject some fuzzyness into changing the per-cpu group shares + * this avoids remote rq-locks at the expense of fairness. + * default: 4 + */ +unsigned int sysctl_sched_shares_thresh = 4; + +/* + * period over which we measure -rt task cpu usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +static __read_mostly int scheduler_running; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(struct rq *rq, struct task_struct *p) +{ + return task_current(rq, p); +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return task_current(rq, p); +#endif +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +/* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + for (;;) { + struct rq *rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + spin_unlock(&rq->lock); + } +} + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + spin_unlock_irqrestore(&rq->lock, *flags); + } +} + +void task_rq_unlock_wait(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */ + spin_unlock_wait(&rq->lock); +} + +static void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. + */ +static struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock(&rq->lock); + update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) +{ + struct rq *rq = arg; + + spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + spin_unlock(&rq->lock); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + + hrtimer_set_expires(timer, time); + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + rq->hrtick_csd_pending = 1; + } +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hrtick_clear(cpu_rq(cpu)); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static __init void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} +#else +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; + + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif + + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; +} +#else /* CONFIG_SCHED_HRTICK */ +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SCHED_HRTICK */ + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) +{ + int cpu; + + assert_spin_locked(&task_rq(p)->lock); + + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} + +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif /* CONFIG_NO_HZ */ + +#else /* !CONFIG_SMP */ +static void resched_task(struct task_struct *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif /* CONFIG_SMP */ + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) +{ + u64 tmp; + + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } + + tmp = (u64)delta_exec * weight; + /* + * Check whether we'd overflow the 64-bit multiplication: + */ + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +} + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); + +/* + * runqueue iterator, to support SMP load-balancing between different + * scheduling classes, without having to expose their internal data + * structures to the load-balancing proper: + */ +struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); +}; + +#ifdef CONFIG_SMP +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator); + +static int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator); +#endif + +#ifdef CONFIG_CGROUP_CPUACCT +static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +#endif + +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + struct task_group *parent, *child; + int ret; + + rcu_read_lock(); + parent = &root_task_group; +down: + ret = (*down)(parent, data); + if (ret) + goto out_unlock; + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + ret = (*up)(parent, data); + if (ret) + goto out_unlock; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out_unlock: + rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + + if (nr_running) + rq->avg_load_per_task = rq->load.weight / nr_running; + else + rq->avg_load_per_task = 0; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, unsigned long sd_rq_weight) +{ + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[cpu]) + return; + + rq_weight = tg->cfs_rq[cpu]->rq_weight; + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = (sd_shares * rq_weight) / sd_rq_weight; + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); + + if (abs(shares - tg->se[cpu]->load.weight) > + sysctl_sched_shares_thresh) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + tg->cfs_rq[cpu]->shares = shares; + + __set_se_shares(tg->se[cpu], shares); + spin_unlock_irqrestore(&rq->lock, flags); + } +} + +/* + * Re-compute the task group their per cpu shares over the given domain. + * This needs to be done in a bottom-up fashion because the rq weight of a + * parent group depends on the shares of its child groups. + */ +static int tg_shares_up(struct task_group *tg, void *data) +{ + unsigned long weight, rq_weight = 0; + unsigned long shares = 0; + struct sched_domain *sd = data; + int i; + + for_each_cpu(i, sched_domain_span(sd)) { + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation. + */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; + shares += tg->cfs_rq[i]->shares; + } + + if ((!shares && rq_weight) || shares > tg->shares) + shares = tg->shares; + + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; + + for_each_cpu(i, sched_domain_span(sd)) + update_group_shares_cpu(tg, i, shares, rq_weight); + + return 0; +} + +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. + */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->cfs_rq[cpu]->shares; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_shares(struct sched_domain *sd) +{ + u64 now = cpu_clock(raw_smp_processor_id()); + s64 elapsed = now - sd->last_update; + + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { + sd->last_update = now; + walk_tg_tree(tg_nop, tg_shares_up, sd); + } +} + +static void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ + spin_unlock(&rq->lock); + update_shares(sd); + spin_lock(&rq->lock); +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + +#else + +static inline void update_shares(struct sched_domain *sd) +{ +} + +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ +} + +#endif + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +#ifdef CONFIG_SMP + cfs_rq->shares = shares; +#endif +} +#endif + +#include "sched_stats.h" +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + +#define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +static void inc_nr_running(struct rq *rq) +{ + rq->nr_running++; +} + +static void dec_nr_running(struct rq *rq) +{ + rq->nr_running--; +} + +static void set_load_weight(struct task_struct *p) +{ + if (task_has_rt_policy(p)) { + p->se.load.weight = prio_to_weight[0] * 2; + p->se.load.inv_weight = prio_to_wmult[0] >> 1; + return; + } + + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + p->se.load.weight = WEIGHT_IDLEPRIO; + p->se.load.inv_weight = WMULT_IDLEPRIO; + return; + } + + p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; + p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; +} + +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +{ + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, wakeup); + p->se.on_rq = 1; +} + +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) +{ + if (sleep && p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } + + sched_info_dequeued(p); + p->sched_class->dequeue_task(rq, p, sleep); + p->se.on_rq = 0; +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) +{ + return p->static_prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + int prio; + + if (task_has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/* + * activate_task - move a task to the runqueue. + */ +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, wakeup); + inc_nr_running(rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, sleep); + dec_nr_running(rq); +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const struct task_struct *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio, int running) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p, running); + p->sched_class->switched_to(rq, p, running); + } else + p->sched_class->prio_changed(rq, p, oldprio, running); +} + +#ifdef CONFIG_SMP + +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Is this task likely cache-hot: + */ +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ + s64 delta; + + /* + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; + + if (p->sched_class != &fair_sched_class) + return 0; + + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + + +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ + int old_cpu = task_cpu(p); + struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct cfs_rq *old_cfsrq = task_cfs_rq(p), + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); + u64 clock_offset; + + clock_offset = old_rq->clock - new_rq->clock; + + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + +#ifdef CONFIG_SCHEDSTATS + if (p->se.wait_start) + p->se.wait_start -= clock_offset; + if (p->se.sleep_start) + p->se.sleep_start -= clock_offset; + if (p->se.block_start) + p->se.block_start -= clock_offset; + if (old_cpu != new_cpu) { + schedstat_inc(p, se.nr_migrations); + if (task_hot(p, old_rq->clock, NULL)) + schedstat_inc(p, se.nr_forced2_migrations); + } +#endif + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; + + __set_task_cpu(p, new_cpu); +} + +struct migration_req { + struct list_head list; + + struct task_struct *task; + int dest_cpu; + + struct completion done; +}; + +/* + * The task's runqueue lock must be held. + * Returns true if you have to wait for migration thread. + */ +static int +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +{ + struct rq *rq = task_rq(p); + + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->se.on_rq && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } + + init_completion(&req->done); + req->task = p; + req->dest_cpu = dest_cpu; + list_add(&req->list, &rq->migration_queue); + + return 1; +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ + unsigned long flags; + int running, on_rq; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + trace_sched_wait_task(rq, p); + running = task_running(rq, p); + on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, &flags); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it wa still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + schedule_timeout_uninterruptible(1); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } + + return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesnt have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(struct task_struct *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + load = weighted_cpuload(i); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; + + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, stop there. + */ + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + break; + if (tmp->flags & flag) + sd = tmp; + } + + if (sd) + update_shares(sd); + + while (sd) { + struct sched_group *group; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, t, cpu); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, t, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= cpumask_weight(sched_domain_span(tmp))) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +{ + int cpu, orig_cpu, this_cpu, success = 0; + unsigned long flags; + long old_state; + struct rq *rq; + + if (!sched_feat(SYNC_WAKEUPS)) + sync = 0; + +#ifdef CONFIG_SMP + if (sched_feat(LB_WAKEUP_UPDATE)) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); + cpu = task_cpu(p); + + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + update_shares(sd); + break; + } + } + } +#endif + + smp_wmb(); + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->se.on_rq) + goto out_running; + + cpu = task_cpu(p); + orig_cpu = cpu; + this_cpu = smp_processor_id(); + +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) + goto out_activate; + + cpu = p->sched_class->select_task_rq(p, sync); + if (cpu != orig_cpu) { + set_task_cpu(p, cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->se.on_rq) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(rq, ttwu_count); + if (cpu == this_cpu) + schedstat_inc(rq, ttwu_local); + else { + struct sched_domain *sd; + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif /* CONFIG_SCHEDSTATS */ + +out_activate: +#endif /* CONFIG_SMP */ + schedstat_inc(p, se.nr_wakeups); + if (sync) + schedstat_inc(p, se.nr_wakeups_sync); + if (orig_cpu != cpu) + schedstat_inc(p, se.nr_wakeups_migrate); + if (cpu == this_cpu) + schedstat_inc(p, se.nr_wakeups_local); + else + schedstat_inc(p, se.nr_wakeups_remote); + activate_task(rq, p, 1); + success = 1; + +out_running: + trace_sched_wakeup(rq, p, success); + check_preempt_curr(rq, p, sync); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif +out: + current->se.last_wakeup = current->se.sum_exec_runtime; + + task_rq_unlock(rq, &flags); + + return success; +} +#endif /* !DDE_LINUX */ + +int wake_up_process(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, 0); +} +EXPORT_SYMBOL(wake_up_process); + +int wake_up_state(struct task_struct *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +#ifndef DDE_LINUX +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: + */ +static void __sched_fork(struct task_struct *p) +{ + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; + +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sum_sleep_runtime = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.wait_max = 0; +#endif + + INIT_LIST_HEAD(&p->rt.run_list); + p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p, int clone_flags) +{ + int cpu = get_cpu(); + + __sched_fork(p); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (likely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif +#ifdef CONFIG_PREEMPT + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; +#endif + put_cpu(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +{ + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); + update_rq_clock(rq); + + p->prio = effective_prio(p); + + if (!p->sched_class->task_new || !current->se.on_rq) { + activate_task(rq, p, 0); + } else { + /* + * Let the scheduling class do new task startup + * management (if any): + */ + p->sched_class->task_new(rq, p); + inc_nr_running(rq); + } + trace_sched_wakeup_new(rq, p, 1); + check_preempt_curr(rq, p, 0); +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is safe to call from within a preemption notifier. + */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + fire_sched_out_preempt_notifiers(prev, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(struct rq *rq, struct task_struct *prev) + __releases(rq->lock) +{ + struct mm_struct *mm = rq->prev_mm; + long prev_state; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * The test for TASK_DEAD must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul <manfred@colorfullife.com> + */ + prev_state = prev->state; + finish_arch_switch(prev); + finish_lock_switch(rq, prev); +#ifdef CONFIG_SMP + if (current->sched_class->post_schedule) + current->sched_class->post_schedule(rq); +#endif + + fire_sched_in_preempt_notifiers(current); + if (mm) + mmdrop(mm); + if (unlikely(prev_state == TASK_DEAD)) { + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + put_task_struct(prev); + } +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq = this_rq(); + + finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline void +context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + struct mm_struct *mm, *oldmm; + + prepare_task_switch(rq, prev, next); + trace_sched_switch(rq, prev, next); + mm = next->mm; + oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_enter_lazy_cpu_mode(); + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + rq->prev_mm = oldmm; + } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_possible_cpu(i) + sum += cpu_rq(i)->nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long long nr_context_switches(void) +{ + int i; + unsigned long long sum = 0; + + for_each_possible_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_possible_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +unsigned long nr_active(void) +{ + unsigned long i, running = 0, uninterruptible = 0; + + for_each_online_cpu(i) { + running += cpu_rq(i)->nr_running; + uninterruptible += cpu_rq(i)->nr_uninterruptible; + } + + if (unlikely((long)uninterruptible < 0)) + uninterruptible = 0; + + return running + uninterruptible; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). + */ +static void update_cpu_load(struct rq *this_rq) +{ + unsigned long this_load = this_rq->load.weight; + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } +} + +#ifdef CONFIG_SMP + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&rq2->lock); + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } + update_rq_clock(rq1); + update_rq_clock(rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. + */ +static void sched_migrate_task(struct task_struct *p, int dest_cpu) +{ + struct migration_req req; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) + || unlikely(!cpu_active(dest_cpu))) + goto out; + + /* force the process onto the specified CPU */ + if (migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread (might exit: take ref). */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); + task_rq_unlock(rq, &flags); + wake_up_process(mt); + put_task_struct(mt); + wait_for_completion(&req.done); + + return; + } +out: + task_rq_unlock(rq, &flags); +} + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. + */ +void sched_exec(void) +{ + int new_cpu, this_cpu = get_cpu(); + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) +{ + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + check_preempt_curr(this_rq, p, 0); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { + schedstat_inc(p, se.nr_failed_migrations_affine); + return 0; + } + *all_pinned = 0; + + if (task_running(rq, p)) { + schedstat_inc(p, se.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + if (!task_hot(p, rq->clock, sd) || + sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->clock, sd)) { + schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.nr_forced_migrations); + } +#endif + return 1; + } + + if (task_hot(p, rq->clock, sd)) { + schedstat_inc(p, se.nr_failed_migrations_hot); + return 0; + } + return 1; +} + +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator) +{ + int loops = 0, pulled = 0, pinned = 0; + struct task_struct *p; + long rem_load_move = max_load_move; + + if (max_load_move == 0) + goto out; + + pinned = 1; + + /* + * Start the load-balancing iterator: + */ + p = iterator->start(iterator->arg); +next: + if (!p || loops++ > sysctl_sched_nr_migrate) + goto out; + + if ((p->se.load.weight >> 1) > rem_load_move || + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + p = iterator->next(iterator->arg); + goto next; + } + + pull_task(busiest, p, this_rq, this_cpu); + pulled++; + rem_load_move -= p->se.load.weight; + + /* + * We only want to steal up to the prescribed amount of weighted load. + */ + if (rem_load_move > 0) { + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; + p = iterator->next(iterator->arg); + goto next; + } +out: + /* + * Right now, this is one of only two places pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; + + return max_load_move - rem_load_move; +} + +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + const struct sched_class *class = sched_class_highest; + unsigned long total_load_moved = 0; + int this_best_prio = this_rq->curr->prio; + + do { + total_load_moved += + class->load_balance(this_rq, this_cpu, busiest, + max_load_move - total_load_moved, + sd, idle, all_pinned, &this_best_prio); + class = class->next; + + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + + } while (class && max_load_move > total_load_moved); + + return total_load_moved > 0; +} + +static int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator) +{ + struct task_struct *p = iterator->start(iterator->arg); + int pinned = 0; + + while (p) { + if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). + */ + schedstat_inc(sd, lb_gained[idle]); + + return 1; + } + p = iterator->next(iterator->arg); + } + + return 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + const struct sched_class *class; + + for (class = sched_class_highest; class; class = class->next) + if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) + return 1; + + return 0; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the amount of weighted load which + * should be moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, const struct cpumask *cpus, int *balance) +{ + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; + unsigned long busiest_load_per_task, busiest_nr_running; + unsigned long this_load_per_task, this_nr_running; + int load_idx, group_imb = 0; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance = 1; + unsigned long leader_nr_running = 0, min_load_per_task = 0; + unsigned long min_nr_running = ULONG_MAX; + struct sched_group *group_min = NULL, *group_leader = NULL; +#endif + + max_load = this_load = total_load = total_pwr = 0; + busiest_load_per_task = busiest_nr_running = 0; + this_load_per_task = this_nr_running = 0; + + if (idle == CPU_NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == CPU_NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; + + do { + unsigned long load, group_capacity, max_cpu_load, min_cpu_load; + int local_group; + int i; + int __group_imb = 0; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long sum_nr_running, sum_weighted_load; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + if (local_group) + balance_cpu = cpumask_first(sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + sum_weighted_load = sum_nr_running = avg_load = 0; + sum_avg_load_per_task = avg_load_per_task = 0; + + max_cpu_load = 0; + min_cpu_load = ~0UL; + + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); + + if (*sd_idle && rq->nr_running) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } + + avg_load += load; + sum_nr_running += rq->nr_running; + sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + + total_load += avg_load; + total_pwr += group->__cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); + + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + __group_imb = 1; + + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + + if (local_group) { + this_load = avg_load; + this = group; + this_nr_running = sum_nr_running; + this_load_per_task = sum_weighted_load; + } else if (avg_load > max_load && + (sum_nr_running > group_capacity || __group_imb)) { + max_load = avg_load; + busiest = group; + busiest_nr_running = sum_nr_running; + busiest_load_per_task = sum_weighted_load; + group_imb = __group_imb; + } + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || + !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (this_nr_running >= group_capacity || + !this_nr_running)) + power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!power_savings_balance || sum_nr_running >= group_capacity + || !sum_nr_running) + goto group_next; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_min)))) { + group_min = group; + min_nr_running = sum_nr_running; + min_load_per_task = sum_weighted_load / + sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) { + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_leader)))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } + } +group_next: +#endif + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load || busiest_nr_running == 0) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; + + busiest_load_per_task /= busiest_nr_running; + if (group_imb) + busiest_load_per_task = min(busiest_load_per_task, avg_load); + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. + */ + if (max_load <= busiest_load_per_task) + goto out_balanced; + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (max_load < avg_load) { + *imbalance = 0; + goto small_imbalance; + } + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * busiest->__cpu_power, + (avg_load - this_load) * this->__cpu_power) + / SCHED_LOAD_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < busiest_load_per_task) { + unsigned long tmp, pwr_now, pwr_move; + unsigned int imbn; + +small_imbalance: + pwr_move = pwr_now = 0; + imbn = 2; + if (this_nr_running) { + this_load_per_task /= this_nr_running; + if (busiest_load_per_task > this_load_per_task) + imbn = 1; + } else + this_load_per_task = cpu_avg_load_per_task(this_cpu); + + if (max_load - this_load + busiest_load_per_task >= + busiest_load_per_task * imbn) { + *imbalance = busiest_load_per_task; + return busiest; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->__cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->__cpu_power * + min(this_load_per_task, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = sg_div_cpu_power(busiest, + busiest_load_per_task * SCHED_LOAD_SCALE); + if (max_load > tmp) + pwr_move += busiest->__cpu_power * + min(busiest_load_per_task, max_load - tmp); + + /* Amount of load we'd add */ + if (max_load * busiest->__cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(this, + max_load * busiest->__cpu_power); + else + tmp = sg_div_cpu_power(this, + busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += this->__cpu_power * + min(this_load_per_task, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; + } + + return busiest; + +out_balanced: +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto ret; + + if (this == group_leader && group_leader != group_min) { + *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + cpumask_first(sched_group_cpus(group_leader)); + } + return group_min; + } +#endif +ret: + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static struct rq * +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, + unsigned long imbalance, const struct cpumask *cpus) +{ + struct rq *busiest = NULL, *rq; + unsigned long max_load = 0; + int i; + + for_each_cpu(i, sched_group_cpus(group)) { + unsigned long wl; + + if (!cpumask_test_cpu(i, cpus)) + continue; + + rq = cpu_rq(i); + wl = weighted_cpuload(i); + + if (rq->nr_running == 1 && wl > imbalance) + continue; + + if (wl > max_load) { + max_load = wl; + busiest = rq; + } + } + + return busiest; +} + +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + */ +static int load_balance(int this_cpu, struct rq *this_rq, + struct sched_domain *sd, enum cpu_idle_type idle, + int *balance, struct cpumask *cpus) +{ + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + struct sched_group *group; + unsigned long imbalance; + struct rq *busiest; + unsigned long flags; + + cpumask_setall(cpus); + + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as CPU_IDLE, instead of + * portraying it as CPU_NOT_IDLE. + */ + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + sd_idle = 1; + + schedstat_inc(sd, lb_count[idle]); + +redo: + update_shares(sd); + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + cpus, balance); + + if (*balance == 0) + goto out_balanced; + + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(group, idle, imbalance, cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + ld_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. ld_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + local_irq_save(flags); + double_rq_lock(this_rq, busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); + + /* + * some other cpu did the load balance for us. + */ + if (ld_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) { + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) + goto redo; + goto out_balanced; + } + } + + if (!ld_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + + spin_lock_irqsave(&busiest->lock, flags); + + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { + spin_unlock_irqrestore(&busiest->lock, flags); + all_pinned = 1; + goto out_one_pinned; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + spin_unlock_irqrestore(&busiest->lock, flags); + if (active_balance) + wake_up_process(busiest->migration_thread); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries+1; + } + } else + sd->nr_balance_failed = 0; + + if (likely(!active_balance)) { + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + } + + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + ld_moved = -1; + + goto out; + +out_balanced: + schedstat_inc(sd, lb_balanced[idle]); + + sd->nr_balance_failed = 0; + +out_one_pinned: + /* tune up the balancing interval */ + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) + sd->balance_interval *= 2; + + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + ld_moved = -1; + else + ld_moved = 0; +out: + if (ld_moved) + update_shares(sd); + return ld_moved; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). + * this_rq is locked. + */ +static int +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + struct cpumask *cpus) +{ + struct sched_group *group; + struct rq *busiest = NULL; + unsigned long imbalance; + int ld_moved = 0; + int sd_idle = 0; + int all_pinned = 0; + + cpumask_setall(cpus); + + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as CPU_NOT_IDLE. + */ + if (sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + sd_idle = 1; + + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); +redo: + update_shares_locked(this_rq, sd); + group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, + &sd_idle, cpus, NULL); + if (!group) { + schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); + goto out_balanced; + } + + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); + + ld_moved = 0; + if (busiest->nr_running > 1) { + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + /* this_rq->clock is already updated */ + update_rq_clock(busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); + double_unlock_balance(this_rq, busiest); + + if (unlikely(all_pinned)) { + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) + goto redo; + } + } + + if (!ld_moved) { + int active_balance = 0; + + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. + */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + /* + * Should not call ttwu while holding a rq->lock + */ + spin_unlock(&this_rq->lock); + if (active_balance) + wake_up_process(busiest->migration_thread); + spin_lock(&this_rq->lock); + + } else + sd->nr_balance_failed = 0; + + update_shares_locked(this_rq, sd); + return ld_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + return -1; + sd->nr_balance_failed = 0; + + return 0; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static void idle_balance(int this_cpu, struct rq *this_rq) +{ + struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + HZ; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; + + for_each_domain(this_cpu, sd) { + unsigned long interval; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (sd->flags & SD_BALANCE_NEWIDLE) + /* If we've pulled tasks over stop searching: */ + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, tmpmask); + + interval = msecs_to_jiffies(sd->balance_interval); + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + if (pulled_task) + break; + } + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; + } + free_cpumask_var(tmpmask); +} + +/* + * active_load_balance is run by migration threads. It pushes running tasks + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be + * running on each physical CPU where possible, and avoids physical / + * logical imbalances. + * + * Called with busiest_rq locked. + */ +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +{ + int target_cpu = busiest_rq->push_cpu; + struct sched_domain *sd; + struct rq *target_rq; + + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + return; + + target_rq = cpu_rq(target_cpu); + + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + update_rq_clock(busiest_rq); + update_rq_clock(target_rq); + + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) { + if ((sd->flags & SD_LOAD_BALANCE) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + schedstat_inc(sd, alb_count); + + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + double_unlock_balance(busiest_rq, target_rq); +} + +#ifdef CONFIG_NO_HZ +static struct { + atomic_t load_balancer; + cpumask_var_t cpu_mask; +} nohz ____cacheline_aligned = { + .load_balancer = ATOMIC_INIT(-1), +}; + +/* + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. If all the cpus in the system + * go into this tickless mode, then there will be no ilb owner (as there is + * no need for one) and all the cpus will sleep till the next wakeup event + * arrives... + * + * For the ilb owner, tick is not stopped. And this tick will be used + * for idle load balancing. ilb owner will still be part of + * nohz.cpu_mask.. + * + * While stopping the tick, this cpu will become the ilb owner if there + * is no other owner. And will be the owner till that cpu becomes busy + * or if all cpus in the system stop their ticks at which point + * there is no need for ilb owner. + * + * When the ilb owner becomes busy, it nominates another owner, during the + * next busy scheduler_tick() + */ +int select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + cpu_rq(cpu)->in_nohz_recently = 1; + + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return 0; + + /* + * If we are going offline and still the leader, + * give up! + */ + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + + return 0; + } + + cpumask_set_cpu(cpu, nohz.cpu_mask); + + /* time for ilb owner also to sleep */ + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (atomic_read(&nohz.load_balancer) == cpu) + atomic_set(&nohz.load_balancer, -1); + return 0; + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) + return 1; + } else if (atomic_read(&nohz.load_balancer) == cpu) + return 1; + } else { + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) + return 0; + + cpumask_clear_cpu(cpu, nohz.cpu_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + } + return 0; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ +static void rebalance_domains(int cpu, enum cpu_idle_type idle) +{ + int balance = 1; + struct rq *rq = cpu_rq(cpu); + unsigned long interval; + struct sched_domain *sd; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. */ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; + + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != CPU_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + if (interval > HZ*NR_CPUS/10) + interval = HZ*NR_CPUS/10; + + need_serialize = sd->flags & SD_SERIALIZE; + + if (need_serialize) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle. + */ + idle = CPU_NOT_IDLE; + } + sd->last_balance = jiffies; + } + if (need_serialize) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) { + next_balance = sd->last_balance + interval; + update_next_balance = 1; + } + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; + } + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; + + free_cpumask_var(tmp); +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_at_tick ? + CPU_IDLE : CPU_NOT_IDLE; + + rebalance_domains(this_cpu, idle); + +#ifdef CONFIG_NO_HZ + /* + * If this cpu is the owner for idle load balancing, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + if (this_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == this_cpu) { + struct rq *rq; + int balance_cpu; + + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) + break; + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + } +#endif +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * + * In case of CONFIG_NO_HZ, this is the place where we nominate a new + * idle load balancing owner or decide to stop the periodic load balancing, + * if the whole system is idle. + */ +static inline void trigger_load_balance(struct rq *rq, int cpu) +{ +#ifdef CONFIG_NO_HZ + /* + * If we were in the nohz mode recently and busy at the current + * scheduler tick, then check if we need to nominate new idle + * load balancer. + */ + if (rq->in_nohz_recently && !rq->idle_at_tick) { + rq->in_nohz_recently = 0; + + if (atomic_read(&nohz.load_balancer) == cpu) { + cpumask_clear_cpu(cpu, nohz.cpu_mask); + atomic_set(&nohz.load_balancer, -1); + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* + * simple selection for now: Nominate the + * first cpu in the nohz list to be the next + * ilb owner. + * + * TBD: Traverse the sched domains and nominate + * the nearest cpu in the nohz.cpu_mask. + */ + int ilb = cpumask_first(nohz.cpu_mask); + + if (ilb < nr_cpu_ids) + resched_cpu(ilb); + } + } + + /* + * If this cpu is idle and doing idle load balancing for all the + * cpus with ticks stopped, is it time for that to stop? + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + resched_cpu(cpu); + return; + } + + /* + * If this cpu is idle and the idle load balancing is done by + * someone else, then no need raise the SCHED_SOFTIRQ + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && + cpumask_test_cpu(cpu, nohz.cpu_mask)) + return; +#endif + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); +} + +#else /* CONFIG_SMP */ + +/* + * on UP we do not need to balance between CPUs: + */ +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + */ +unsigned long long task_delta_exec(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + + if (task_current(rq, p)) { + u64 delta_exec; + + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; + if ((s64)delta_exec > 0) + ns = delta_exec; + } + + task_rq_unlock(rq, &flags); + + return ns; +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + /* Add user time to process. */ + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + account_group_user_time(p, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); + /* Account for user time used */ + acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + cputime64_t tmp; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + tmp = cputime_to_cputime64(cputime); + + /* Add guest time to process. */ + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + account_group_user_time(p, cputime); + p->gtime = cputime_add(p->gtime, cputime); + + /* Add guest time to cpustat. */ + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else + cpustat->system = cputime64_add(cpustat->system, tmp); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account for involuntary wait time. + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +inline cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; + + sched_clock_tick(); + + spin_lock(&rq->lock); + update_rq_clock(rq); + update_cpu_load(rq); + curr->sched_class->task_tick(rq, curr, 0); + spin_unlock(&rq->lock); + +#ifdef CONFIG_SMP + rq->idle_at_tick = idle_cpu(cpu); + trigger_load_balance(rq, cpu); +#endif +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} + +void __kprobes add_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; +#endif + preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? + */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +} +EXPORT_SYMBOL(add_preempt_count); + +void __kprobes sub_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ + struct pt_regs *regs = get_irq_regs(); + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + + debug_show_held_locks(prev); + print_modules(); + if (irqs_disabled()) + print_irqtrace_events(prev); + + if (regs) + show_regs(regs); + else + dump_stack(); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + __schedule_bug(prev); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + + schedstat_inc(this_rq(), sched_count); +#ifdef CONFIG_SCHEDSTATS + if (unlikely(prev->lock_depth >= 0)) { + schedstat_inc(this_rq(), bkl_count); + schedstat_inc(prev, sched_info.bkl_count); + } +#endif +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev) +{ + const struct sched_class *class; + struct task_struct *p; + + /* + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: + */ + if (likely(rq->nr_running == rq->cfs.nr_running)) { + p = fair_sched_class.pick_next_task(rq); + if (likely(p)) + return p; + } + + class = sched_class_highest; + for ( ; ; ) { + p = class->pick_next_task(rq); + if (p) + return p; + /* + * Will never be NULL as the idle class always + * returns a non-NULL p: + */ + class = class->next; + } +} + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + struct task_struct *prev, *next; + unsigned long *switch_count; + struct rq *rq; + int cpu; + +need_resched: + preempt_disable(); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_qsctr_inc(cpu); + prev = rq->curr; + switch_count = &prev->nivcsw; + + release_kernel_lock(prev); +need_resched_nonpreemptible: + + schedule_debug(prev); + + if (sched_feat(HRTICK)) + hrtick_clear(rq); + + spin_lock_irq(&rq->lock); + update_rq_clock(rq); + clear_tsk_need_resched(prev); + + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (unlikely(signal_pending_state(prev->state, prev))) + prev->state = TASK_RUNNING; + else + deactivate_task(rq, prev, 1); + switch_count = &prev->nvcsw; + } + +#ifdef CONFIG_SMP + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +#endif + + if (unlikely(!rq->nr_running)) + idle_balance(cpu, rq); + + prev->sched_class->put_prev_task(rq, prev); + next = pick_next_task(rq, prev); + + if (likely(prev != next)) { + sched_info_switch(prev, next); + + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under + * us, hence refresh the local variables. + */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + } else + spin_unlock_irq(&rq->lock); + + if (unlikely(reacquire_kernel_lock(current) < 0)) + goto need_resched_nonpreemptible; + + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); + + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (likely(ti->preempt_count || irqs_disabled())) + return; + + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +} +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); + + /* Catch callers which need to be fixed */ + BUG_ON(ti->preempt_count || !irqs_disabled()); + + do { + add_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); + schedule(); + local_irq_disable(); + sub_preempt_count(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +} + +#endif /* CONFIG_PREEMPT */ +#endif /* !DDE_LINUX */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) +{ + return try_to_wake_up(curr->private, mode, sync); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, sync, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} + +/** + * __wake_up_sync - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + */ +void +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + int sync = 1; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + sync = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ +unsigned long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) +{ + unsigned long flags; + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); + + __set_current_state(state); + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, &wait); + spin_unlock(&q->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&q->lock); + __remove_wait_queue(q, &wait); + spin_unlock_irqrestore(&q->lock, flags); + + return timeout; +} + +#ifndef DDE_LINUX +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(interruptible_sleep_on); + +long __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void __sched sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(sleep_on); + +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(sleep_on_timeout); + +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(struct task_struct *p, int prio) +{ + unsigned long flags; + int oldprio, on_rq, running; + struct rq *rq; + const struct sched_class *prev_class = p->sched_class; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + + oldprio = p->prio; + on_rq = p->se.on_rq; + running = task_current(rq, p); + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + p->prio = prio; + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { + enqueue_task(rq, p, 0); + + check_class_changed(rq, p, prev_class, oldprio, running); + } + task_rq_unlock(rq, &flags); +} + +#endif + +void set_user_nice(struct task_struct *p, long nice) +{ + int old_prio, delta, on_rq; + unsigned long flags; + struct rq *rq; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * SCHED_FIFO/SCHED_RR: + */ + if (task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + on_rq = p->se.on_rq; + if (on_rq) + dequeue_task(rq, p, 0); + + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; + + if (on_rq) { + enqueue_task(rq, p, 0); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; + + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + if (increment < -40) + increment = -40; + if (increment > 40) + increment = 40; + + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(const struct task_struct *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(const struct task_struct *p) +{ + return TASK_NICE(p); +} +EXPORT_SYMBOL(task_nice); + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. */ +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +{ + BUG_ON(p->se.on_rq); + + p->policy = policy; + switch (p->policy) { + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + p->sched_class = &fair_sched_class; + break; + case SCHED_FIFO: + case SCHED_RR: + p->sched_class = &rt_sched_class; + break; + } + + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + set_load_weight(p); +} +#endif + +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + +static int __sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param, bool user) +{ +#ifndef DDE_LINUX + int retval, oldprio, oldpolicy = -1, on_rq, running; + unsigned long flags; + const struct sched_class *prev_class = p->sched_class; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. + */ + if (param->sched_priority < 0 || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + return -EINVAL; + if (rt_policy(policy) != (param->sched_priority != 0)) + return -EINVAL; + + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (user && !capable(CAP_SYS_NICE)) { + if (rt_policy(policy)) { + unsigned long rlim_rtprio; + + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &flags); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } + /* + * Like positive nice levels, dont allow tasks to + * move out of SCHED_IDLE either: + */ + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) + return -EPERM; + + /* can't change other user's priorities */ + if (!check_same_owner(p)) + return -EPERM; + } + + if (user) { +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; +#endif + + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock_irqsave(&p->pi_lock, flags); + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. + */ + rq = __task_rq_lock(p); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } + update_rq_clock(rq); + on_rq = p->se.on_rq; + running = task_current(rq, p); + if (on_rq) + deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + oldprio = p->prio; + __setscheduler(rq, p, policy, param->sched_priority); + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { + activate_task(rq, p, 0); + + check_class_changed(rq, p, prev_class, oldprio, running); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + + return 0; +#else + //WARN_UNIMPL; + return 0; +#endif +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +#ifndef DDE_LINUX + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lparam; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); + rcu_read_unlock(); + + return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, + struct sched_param __user *, param) +{ + /* negative values for policy are not valid */ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) +{ + return do_sched_setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +{ + struct task_struct *p; + int retval; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + return retval; +} + +/** + * sys_sched_getscheduler - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +{ + struct sched_param lp; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + cpumask_var_t cpus_allowed, new_mask; + struct task_struct *p; + int retval; + + get_online_cpus(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + put_online_cpus(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + retval = -EPERM; + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); + again: + retval = set_cpus_allowed_ptr(p, new_mask); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + } +out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: + put_task_struct(p); + put_online_cpus(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + struct cpumask *new_mask) +{ + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, struct cpumask *mask) +{ + struct task_struct *p; + int retval; + + get_online_cpus(); + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + +out_unlock: + read_unlock(&tasklist_lock); + put_online_cpus(); + + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if (len < cpumask_size()) + return -EINVAL; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); + + return ret; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + */ +SYSCALL_DEFINE0(sched_yield) +{ + struct rq *rq = this_rq_lock(); + + schedstat_inc(rq, yld_count); + current->sched_class->yield_task(rq); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} +#endif /* !DDE_LINUX */ + +static void __cond_resched(void) +{ +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif + /* + * The BKS might be reacquired before we have dropped + * PREEMPT_ACTIVE, which could trigger a second + * cond_resched() call. + */ + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + +int __sched _cond_resched(void) +{ + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && + system_state == SYSTEM_RUNNING) { + __cond_resched(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(_cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +int cond_resched_lock(spinlock_t *lock) +{ + int resched = need_resched() && system_state == SYSTEM_RUNNING; + int ret = 0; + + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (resched && need_resched()) + __cond_resched(); + else + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(cond_resched_lock); + +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched() && system_state == SYSTEM_RUNNING) { + local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq); + +#ifndef DDE_LINUX +/** + * yield - yield the current processor to other threads. + * + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +void __sched io_schedule(void) +{ + struct rq *rq = &__raw_get_cpu_var(runqueues); + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); +} +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ + struct rq *rq = &__raw_get_cpu_var(runqueues); + long ret; + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct task_struct *p; + unsigned int time_slice; + int retval; + struct timespec t; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + /* + * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER + * tasks that are on an otherwise idle runqueue: + */ + time_slice = 0; + if (p->policy == SCHED_RR) { + time_slice = DEF_TIMESLICE; + } else if (p->policy != SCHED_FIFO) { + struct sched_entity *se = &p->se; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + if (rq->cfs.load.weight) + time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + task_rq_unlock(rq, &flags); + } + read_unlock(&tasklist_lock); + jiffies_to_timespec(time_slice, &t); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + +void sched_show_task(struct task_struct *p) +{ + unsigned long free = 0; + unsigned state; + + state = p->state ? __ffs(p->state) + 1 : 0; + printk(KERN_INFO "%-13.13s %c", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long *n = end_of_stack(p); + while (!*n) + n++; + free = (unsigned long)n - (unsigned long)end_of_stack(p); + } +#endif + printk(KERN_CONT "%5lu %5d %6d\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent)); + + show_stack(p, NULL); +} + +void show_state_filter(unsigned long state_filter) +{ + struct task_struct *g, *p; + +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); +#else + printk(KERN_INFO + " task PC stack pid father\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); + } while_each_thread(g, p); + + touch_all_softlockup_watchdogs(); + +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif + read_unlock(&tasklist_lock); + /* + * Only show locks if all tasks are dumped: + */ + if (state_filter == -1) + debug_show_all_locks(); +} + +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ + idle->sched_class = &idle_sched_class; +} + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ +void __cpuinit init_idle(struct task_struct *idle, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + + __sched_fork(idle); + idle->se.exec_start = sched_clock(); + + idle->prio = idle->normal_prio = MAX_PRIO; + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + __set_task_cpu(idle, cpu); + + rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); +#else + task_thread_info(idle)->preempt_count = 0; +#endif + /* + * The idle tasks have their own, simple scheduling class: + */ + idle->sched_class = &idle_sched_class; + ftrace_graph_init_task(idle); +} +#endif /* DDE_LINUX */ + +/* + * In a system that switches off the HZ timer nohz_cpu_mask + * indicates which cpus entered this state. This is used + * in the rcu update to wait only for active cpus. For system + * which do not switch off the HZ timer nohz_cpu_mask should + * always be CPU_BITS_NONE. + */ +cpumask_var_t nohz_cpu_mask; + +#ifndef DDE_LINUX +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long limit = 200000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; + + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; +} + +#ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we queue a struct migration_req structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + struct migration_req req; + unsigned long flags; + struct rq *rq; + int ret = 0; + + rq = task_rq_lock(p, &flags); + if (!cpumask_intersects(new_mask, cpu_online_mask)) { + ret = -EINVAL; + goto out; + } + + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpumask_equal(&p->cpus_allowed, new_mask))) { + ret = -EINVAL; + goto out; + } + + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + else { + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + } + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. + */ +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + struct rq *rq_dest, *rq_src; + int ret = 0, on_rq; + + if (unlikely(!cpu_active(dest_cpu))) + return ret; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto done; + /* Affinity changed (again). */ + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto fail; + + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq_src, p, 0); + + set_task_cpu(p, dest_cpu); + if (on_rq) { + activate_task(rq_dest, p, 0); + check_preempt_curr(rq_dest, p, 0); + } +done: + ret = 1; +fail: + double_rq_unlock(rq_src, rq_dest); + return ret; +} + +/* + * migration_thread - this is a highprio system thread that performs + * thread migration by bumping thread off CPU then 'pushing' onto + * another runqueue. + */ +static int migration_thread(void *data) +{ + int cpu = (long)data; + struct rq *rq; + + rq = cpu_rq(cpu); + BUG_ON(rq->migration_thread != current); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + struct migration_req *req; + struct list_head *head; + + spin_lock_irq(&rq->lock); + + if (cpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + + head = &rq->migration_queue; + + if (list_empty(head)) { + spin_unlock_irq(&rq->lock); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + continue; + } + req = list_entry(head->next, struct migration_req, list); + list_del_init(head->next); + + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); + + complete(&req->done); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) +{ + int ret; + + local_irq_disable(); + ret = __migrate_task(p, src_cpu, dest_cpu); + local_irq_enable(); + return ret; +} + +/* + * Figure out where task on dead CPU should go, use force if necessary. + */ +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); + +again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); + } + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; +} + +/* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because + * for performance reasons the counter is not stricly tracking tasks to + * their home CPUs. So we just add the counter to another CPU's counter, + * to keep the global sum constant after CPU-down: + */ +static void migrate_nr_uninterruptible(struct rq *rq_src) +{ + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; + rq_src->nr_uninterruptible = 0; + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); +} + +/* Run through task list and migrate tasks from the dead cpu. */ +static void migrate_live_tasks(int src_cpu) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + + do_each_thread(t, p) { + if (p == current) + continue; + + if (task_cpu(p) == src_cpu) + move_task_off_dead_cpu(src_cpu, p); + } while_each_thread(t, p); + + read_unlock(&tasklist_lock); +} + +/* + * Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible. + * Used by CPU offline code. + */ +void sched_idle_next(void) +{ + int this_cpu = smp_processor_id(); + struct rq *rq = cpu_rq(this_cpu); + struct task_struct *p = rq->idle; + unsigned long flags; + + /* cpu has to be offline */ + BUG_ON(cpu_online(this_cpu)); + + /* + * Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on the current cpu. + */ + spin_lock_irqsave(&rq->lock, flags); + + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + + update_rq_clock(rq); + activate_task(rq, p, 0); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +/* + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + +/* called under rq->lock with disabled interrupts */ +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +{ + struct rq *rq = cpu_rq(dead_cpu); + + /* Must be exiting, otherwise would be on tasklist. */ + BUG_ON(!p->exit_state); + + /* Cannot have done final schedule yet: would have vanished. */ + BUG_ON(p->state == TASK_DEAD); + + get_task_struct(p); + + /* + * Drop lock around migration; if someone else moves it, + * that's OK. No task can be added to this CPU, so iteration is + * fine. + */ + spin_unlock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, p); + spin_lock_irq(&rq->lock); + + put_task_struct(p); +} + +/* release_task() removes task from tasklist, so we won't find dead tasks. */ +static void migrate_dead_tasks(unsigned int dead_cpu) +{ + struct rq *rq = cpu_rq(dead_cpu); + struct task_struct *next; + + for ( ; ; ) { + if (!rq->nr_running) + break; + update_rq_clock(rq); + next = pick_next_task(rq, rq->curr); + if (!next) + break; + next->sched_class->put_prev_task(rq, next); + migrate_dead(dead_cpu, next); + + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {0, }, +}; + +static struct ctl_table sd_ctl_root[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {0, }, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(13); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_online_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +static void unregister_sched_domain_sysctl(void) +{ + if (sd_sysctl_header) + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#else +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) +{ +} +#endif + +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpumask_set_cpu(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpumask_clear_cpu(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int __cpuinit +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + struct task_struct *p; + int cpu = (long)hcpu; + unsigned long flags; + struct rq *rq; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + kthread_bind(p, cpu); + /* Must be high prio: stop_machine expects to yield to it. */ + rq = task_rq_lock(p, &flags); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); + cpu_rq(cpu)->migration_thread = p; + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + /* Strictly unnecessary, as first user will wake it. */ + wake_up_process(cpu_rq(cpu)->migration_thread); + + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + + set_rq_online(rq); + } + spin_unlock_irqrestore(&rq->lock, flags); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!cpu_rq(cpu)->migration_thread) + break; + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(cpu_rq(cpu)->migration_thread, + cpumask_any(cpu_online_mask)); + kthread_stop(cpu_rq(cpu)->migration_thread); + cpu_rq(cpu)->migration_thread = NULL; + break; + + case CPU_DEAD: + case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + kthread_stop(rq->migration_thread); + rq->migration_thread = NULL; + /* Idle task back to normal (off runqueue, low prio) */ + spin_lock_irq(&rq->lock); + update_rq_clock(rq); + deactivate_task(rq, rq->idle, 0); + rq->idle->static_prio = MAX_PRIO; + __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); + rq->idle->sched_class = &idle_sched_class; + migrate_dead_tasks(cpu); + spin_unlock_irq(&rq->lock); + cpuset_unlock(); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); + + /* + * No need to migrate the tasks: it was best-effort if + * they didn't take sched_hotcpu_mutex. Just wake up + * the requestors. + */ + spin_lock_irq(&rq->lock); + while (!list_empty(&rq->migration_queue)) { + struct migration_req *req; + + req = list_entry(rq->migration_queue.next, + struct migration_req, list); + list_del_init(&req->list); + spin_unlock_irq(&rq->lock); + complete(&req->done); + spin_lock_irq(&rq->lock); + } + spin_unlock_irq(&rq->lock); + break; + + case CPU_DYING: + case CPU_DYING_FROZEN: + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + spin_unlock_irqrestore(&rq->lock, flags); + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __cpuinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = 10 +}; + +static int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + /* Start one for the boot CPU: */ + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + + return err; +} +early_initcall(migration_init); +#endif + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SCHED_DEBUG + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + struct sched_group *group = sd->groups; + char str[256]; + + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %s level %s\n", str, sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + + printk(KERN_DEBUG "%*s groups:", level + 1, ""); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!group->__cpu_power) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); + break; + } + + if (!cpumask_weight(sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + break; + } + + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } + + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + printk(KERN_CONT " %s", str); + + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + cpumask_var_t groupmask; + int level = 0; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level, groupmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } + free_cpumask_var(groupmask); +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_IDLE | + SD_WAKE_AFFINE | + SD_WAKE_BALANCE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + /* Does parent contain flags not in child? */ + /* WAKE_BALANCE is a subset of WAKE_AFFINE */ + if (cflags & SD_WAKE_AFFINE) + pflags &= ~SD_WAKE_BALANCE; + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; + } + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct root_domain *rd) +{ + cpupri_cleanup(&rd->cpupri); + + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * If we dont want to free the old_rt yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + set_rq_online(rq); + + spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + free_rootdomain(old_rd); +} + +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +{ + memset(rd, 0, sizeof(*rd)); + + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri, true); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; + return 0; + +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain, true); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + sd = sd->parent; + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + rcu_assign_pointer(rq->sd, sd); +} + +/* cpus with isolated domains */ +static cpumask_var_t cpu_isolated_map; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + cpulist_parse(str, cpu_isolated_map); + return 1; +} + +__setup("isolcpus=", isolated_cpu_setup); + +/* + * init_sched_build_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. + */ +static void +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) +{ + struct sched_group *first = NULL, *last = NULL; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct sched_group *sg; + int group = group_fn(i, cpu_map, &sg, tmpmask); + int j; + + if (cpumask_test_cpu(i, covered)) + continue; + + cpumask_clear(sched_group_cpus(sg)); + sg->__cpu_power = 0; + + for_each_cpu(j, span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) + continue; + + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} + +#define SD_NODES_PER_DOMAIN 16 + +#ifdef CONFIG_NUMA + +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, nodemask_t *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < nr_node_ids; i++) { + /* Start at @node */ + n = (node + i) % nr_node_ids; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (node_isset(n, *used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + node_set(best_node, *used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @span: resulting cpumask + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static void sched_domain_node_span(int node, struct cpumask *span) +{ + nodemask_t used_nodes; + int i; + + cpumask_clear(span); + nodes_clear(used_nodes); + + cpumask_or(span, span, cpumask_of_node(node)); + node_set(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, &used_nodes); + + cpumask_or(span, span, cpumask_of_node(next_node)); + } +} +#endif /* CONFIG_NUMA */ + +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; + +/* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. + */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + +/* + * SMT sched-domains: + */ +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); + +static int +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) +{ + if (sg) + *sg = &per_cpu(sched_group_cpus, cpu).sg; + return cpu; +} +#endif /* CONFIG_SCHED_SMT */ + +/* + * multi-core sched-domains: + */ +#ifdef CONFIG_SCHED_MC +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); +#endif /* CONFIG_SCHED_MC */ + +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) +static int +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) +{ + int group; + + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group).sg; + return group; +} +#elif defined(CONFIG_SCHED_MC) +static int +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) +{ + if (sg) + *sg = &per_cpu(sched_group_core, cpu).sg; + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); + +static int +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) +{ + int group; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); +#else + group = cpu; +#endif + if (sg) + *sg = &per_cpu(sched_group_phys, group).sg; + return group; +} + +#ifdef CONFIG_NUMA +/* + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. + */ +static DEFINE_PER_CPU(struct static_sched_domain, node_domains); +static struct sched_group ***sched_group_nodes_bycpu; + +static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); + +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) +{ + int group; + + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); + group = cpumask_first(nodemask); + + if (sg) + *sg = &per_cpu(sched_group_allnodes, group).sg; + return group; +} + +static void init_numa_sched_groups_power(struct sched_group *group_head) +{ + struct sched_group *sg = group_head; + int j; + + if (!sg) + return; + do { + for_each_cpu(j, sched_group_cpus(sg)) { + struct sched_domain *sd; + + sd = &per_cpu(phys_domains, j).sd; + if (j != cpumask_first(sched_group_cpus(sd->groups))) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + + sg_inc_cpu_power(sg, sd->groups->__cpu_power); + } + sg = sg->next; + } while (sg != group_head); +} +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_NUMA +/* Free memory allocated for various sched_group structures */ +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) +{ + int cpu, i; + + for_each_cpu(cpu, cpu_map) { + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (!sched_group_nodes) + continue; + + for (i = 0; i < nr_node_ids; i++) { + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +} +#else /* !CONFIG_NUMA */ +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) +{ +} +#endif /* CONFIG_NUMA */ + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + * + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents + * the maximum number of tasks a group can handle in the presence of other idle + * or lightly loaded groups in the same sched domain. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_domain *child; + struct sched_group *group; + + WARN_ON(!sd || !sd->groups); + + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + return; + + child = sd->child; + + sd->groups->__cpu_power = 0; + + /* + * For perf policy, if the groups in child domain share resources + * (for example cores sharing some portions of the cache hierarchy + * or SMT), then set this domain groups cpu_power such that each group + * can handle only one task, when there are other idle groups in the + * same sched domain. + */ + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && + (child->flags & + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); + return; + } + + /* + * add cpu_power of each child group to this groups cpu_power + */ + group = child->groups; + do { + sg_inc_cpu_power(sd->groups, group->__cpu_power); + group = group->next; + } while (group != child->groups); +} + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + +#define SD_INIT(sd, type) sd_init_##type(sd) + +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +static int default_relax_domain_level = -1; + +static int __init setup_relax_domain_level(char *str) +{ + unsigned long val; + + val = simple_strtoul(str, NULL, 0); + if (val < SD_LV_MAX) + default_relax_domain_level = val; + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + } +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int __build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) +{ + int i, err = -ENOMEM; + struct root_domain *rd; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; +#ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; + struct sched_group **sched_group_nodes = NULL; + int sd_allnodes = 0; + + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA + /* + * Allocate the per-node list of sched groups + */ + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), + GFP_KERNEL); + if (!sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + goto free_tmpmask; + } +#endif + + rd = alloc_rootdomain(); + if (!rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + goto free_sched_groups; + } + +#ifdef CONFIG_NUMA + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; +#endif + + /* + * Set up domains for cpus specified by the cpu_map. + */ + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = NULL, *p; + + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); + +#ifdef CONFIG_NUMA + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); + p = sd; + sd_allnodes = 1; + } else + p = NULL; + + sd = &per_cpu(node_domains, i).sd; + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + sd->parent = p; + if (p) + p->child = sd; + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); +#endif + + p = sd; + sd = &per_cpu(phys_domains, i).sd; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), nodemask); + sd->parent = p; + if (p) + p->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); + +#ifdef CONFIG_SCHED_MC + p = sd; + sd = &per_cpu(core_domains, i).sd; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, + cpu_coregroup_mask(i)); + sd->parent = p; + p->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); +#endif + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i).sd; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); + sd->parent = p; + p->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) + continue; + + init_sched_build_groups(this_sibling_map, cpu_map, + &cpu_to_cpu_group, + send_covered, tmpmask); + } +#endif + +#ifdef CONFIG_SCHED_MC + /* Set up multi-core groups */ + for_each_cpu(i, cpu_map) { + cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); + if (i != cpumask_first(this_core_map)) + continue; + + init_sched_build_groups(this_core_map, cpu_map, + &cpu_to_core_group, + send_covered, tmpmask); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < nr_node_ids; i++) { + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) + continue; + + init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); + } + +#ifdef CONFIG_NUMA + /* Set up node groups */ + if (sd_allnodes) { + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } + + for (i = 0; i < nr_node_ids; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + int j; + + cpumask_clear(covered); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) { + sched_group_nodes[i] = NULL; + continue; + } + + sched_domain_node_span(i, domainspan); + cpumask_and(domainspan, domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for " + "node %d\n", i); + goto error; + } + sched_group_nodes[i] = sg; + for_each_cpu(j, nodemask) { + struct sched_domain *sd; + + sd = &per_cpu(node_domains, j).sd; + sd->groups = sg; + } + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), nodemask); + sg->next = sg; + cpumask_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < nr_node_ids; j++) { + int n = (i + j) % nr_node_ids; + + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) + break; + + cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); + if (cpumask_empty(tmpmask)) + continue; + + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), + GFP_KERNEL, i); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + goto error; + } + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), tmpmask); + sg->next = prev->next; + cpumask_or(covered, covered, tmpmask); + prev->next = sg; + prev = sg; + } + } +#endif + + /* Calculate CPU power for physical packages and nodes */ +#ifdef CONFIG_SCHED_SMT + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; + + init_sched_groups_power(i, sd); + } +#endif +#ifdef CONFIG_SCHED_MC + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(core_domains, i).sd; + + init_sched_groups_power(i, sd); + } +#endif + + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; + + init_sched_groups_power(i, sd); + } + +#ifdef CONFIG_NUMA + for (i = 0; i < nr_node_ids; i++) + init_numa_sched_groups_power(sched_group_nodes[i]); + + if (sd_allnodes) { + struct sched_group *sg; + + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, + tmpmask); + init_numa_sched_groups_power(sg); + } +#endif + + /* Attach the domains */ + for_each_cpu(i, cpu_map) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i).sd; +#elif defined(CONFIG_SCHED_MC) + sd = &per_cpu(core_domains, i).sd; +#else + sd = &per_cpu(phys_domains, i).sd; +#endif + cpu_attach_domain(sd, rd, i); + } + + err = 0; + +free_tmpmask: + free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; + +#ifdef CONFIG_NUMA +error: + free_sched_groups(cpu_map, tmpmask); + free_rootdomain(rd); + goto free_tmpmask; +#endif +} + +static int build_sched_domains(const struct cpumask *cpu_map) +{ + return __build_sched_domains(cpu_map, NULL); +} + +static struct cpumask *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) +{ + return 0; +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ +static int arch_init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + if (!doms_cur) + doms_cur = fallback_doms; + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + dattr_cur = NULL; + err = build_sched_domains(doms_cur); + register_sched_domain_sysctl(); + + return err; +} + +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) +{ + free_sched_groups(cpu_map, tmpmask); +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); + int i; + + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + synchronize_sched(); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be kmalloc'd. This routine takes + * ownership of it and will kfree it when done with it. If the caller + * failed the kmalloc call, then it can pass in doms_new == NULL && + * ndoms_new == 1, and partition_sched_domains() will fallback to + * the single partition 'fallback_doms', it also forces the domains + * to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + + n = doms_new ? ndoms_new : 0; + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(&doms_cur[i], &doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur + i); +match1: + ; + } + + if (doms_new == NULL) { + ndoms_cur = 0; + doms_new = fallback_doms; + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { + if (cpumask_equal(&doms_new[i], &doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* no match - add a new doms_new */ + __build_sched_domains(doms_new + i, + dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != fallback_doms) + kfree(doms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +static void arch_reinit_sched_domains(void) +{ + get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + + rebuild_sched_domains(); + put_online_cpus(); +} + +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ + unsigned int level = 0; + + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? + */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) + return -EINVAL; + + if (smt) + sched_smt_power_savings = level; + else + sched_mc_power_savings = level; + + arch_reinit_sched_domains(); + + return count; +} + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) +{ + return sprintf(page, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, + const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 0); +} +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) +{ + return sprintf(page, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, + const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 1); +} +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + +#ifndef CONFIG_CPUSETS +/* + * Add online and remove offline CPUs from the scheduler domains. + * When cpusets are enabled they take over this function. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + partition_sched_domains(1, NULL, NULL); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + +void __init sched_init_smp(void) +{ + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif + get_online_cpus(); + mutex_lock(&sched_domains_mutex); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); + put_online_cpus(); + +#ifndef CONFIG_CPUSETS + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); +#endif + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + + init_hrtick(); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + BUG(); + sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); +} +#else +void __init sched_init_smp(void) +{ + sched_init_granularity(); +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->rq = rq; +#endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +} + +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + rt_rq->highest_prio = MAX_RT_PRIO; +#endif +#ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; +#endif +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + tg->cfs_rq[cpu] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + if (add) + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[cpu] = se; + /* se could be NULL for init_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + + se->my_q = cfs_rq; + se->load.weight = tg->shares; + se->load.inv_weight = 0; + se->parent = parent; +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + tg->rt_rq[cpu] = rt_rq; + init_rt_rq(rt_rq, rq); + rt_rq->tg = tg; + rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + if (add) + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + + tg->rt_se[cpu] = rt_se; + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + + rt_se->my_q = rt_rq; + rt_se->parent = parent; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif + +void __init sched_init(void) +{ + int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; +#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). + */ + if (alloc_size) { + ptr = (unsigned long)alloc_bootmem(alloc_size); + +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ + } + +#ifdef CONFIG_SMP + init_defrootdomain(); +#endif + + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_GROUP_SCHED + list_add(&init_task_group.list, &task_groups); + INIT_LIST_HEAD(&init_task_group.children); + +#ifdef CONFIG_USER_SCHED + INIT_LIST_HEAD(&root_task_group.children); + init_task_group.parent = &root_task_group; + list_add(&init_task_group.siblings, &root_task_group.children); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ + + for_each_possible_cpu(i) { + struct rq *rq; + + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + rq->nr_running = 0; + init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.shares = init_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). + */ + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ + init_tg_cfs_entry(&init_task_group, + &per_cpu(init_cfs_rq, i), + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); + +#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#ifdef CONFIG_RT_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); + init_tg_rt_entry(&init_task_group, + &per_cpu(init_rt_rq, i), + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); +#endif +#endif + + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; +#ifdef CONFIG_SMP + rq->sd = NULL; + rq->rd = NULL; + rq->active_balance = 0; + rq->next_balance = jiffies; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); + rq_attach_root(rq, &def_root_domain); +#endif + init_rq_hrtick(rq); + atomic_set(&rq->nr_iowait, 0); + } + + set_load_weight(&init_task); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +#endif + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +#endif + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". + */ + init_idle(current, smp_processor_id()); + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_SMP +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ + + scheduler_running = 1; +} + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#ifdef in_atomic + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + int on_rq; + + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); + resched_task(rq->curr); + } +} + +void normalize_rt_tasks(void) +{ + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + + p->se.exec_start = 0; +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; +#endif + + if (!rt_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: + */ + if (TASK_NICE(p) < 0 && p->mm) + set_user_nice(p, 0); + continue; + } + + spin_lock(&p->pi_lock); + rq = __task_rq_lock(p); + + normalize_task(rq, p); + + __task_rq_unlock(rq); + spin_unlock(&p->pi_lock); + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_IA64 +/* + * These functions are only useful for the IA64 MCA handling. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +struct task_struct *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ + cpu_curr(cpu) = p; +} + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se; + struct rq *rq; + int i; + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->se) + goto err; + + tg->shares = NICE_0_LOAD; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!se) + goto err; + + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + } + + return 1; + + err: + return 0; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, + &cpu_rq(cpu)->leaf_cfs_rq_list); +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +} +#else /* !CONFG_FAIR_GROUP_SCHED */ +static inline void free_fair_sched_group(struct task_group *tg) +{ +} + +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; + + destroy_rt_bandwidth(&tg->rt_bandwidth); + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + struct rq *rq; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_rq) + goto err; + + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_se) + goto err; + + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + } + + return 1; + + err: + return 0; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, + &cpu_rq(cpu)->leaf_rt_rq_list); +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); +} +#else /* !CONFIG_RT_GROUP_SCHED */ +static inline void free_rt_sched_group(struct task_group *tg) +{ +} + +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_GROUP_SCHED +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ + struct task_group *tg; + unsigned long flags; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg, parent)) + goto err; + + if (!alloc_rt_sched_group(tg, parent)) + goto err; + + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) { + register_fair_sched_group(tg, i); + register_rt_sched_group(tg, i); + } + list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); + spin_unlock_irqrestore(&task_group_lock, flags); + + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + free_sched_group(container_of(rhp, struct task_group, rcu)); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) { + unregister_fair_sched_group(tg, i); + unregister_rt_sched_group(tg, i); + } + list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); + spin_unlock_irqrestore(&task_group_lock, flags); + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + update_rq_clock(rq); + + running = task_current(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + + set_task_rq(tsk, task_cpu(tsk)); + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->moved_group) + tsk->sched_class->moved_group(tsk); +#endif + + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, tsk, 0); + + task_rq_unlock(rq, &flags); +} +#endif /* CONFIG_GROUP_SCHED */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void __set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + int on_rq; + + on_rq = se->on_rq; + if (on_rq) + dequeue_entity(cfs_rq, se, 0); + + se->load.weight = shares; + se->load.inv_weight = 0; + + if (on_rq) + enqueue_entity(cfs_rq, se, 0); +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static DEFINE_MUTEX(shares_mutex); + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + unsigned long flags; + + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + mutex_lock(&shares_mutex); + if (tg->shares == shares) + goto done; + + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + list_del_rcu(&tg->siblings); + spin_unlock_irqrestore(&task_group_lock, flags); + + /* wait for any ongoing reference to this group to finish */ + synchronize_sched(); + + /* + * Now we are free to modify the group's share on each cpu + * w/o tripping rebalance_share or load_balance_fair. + */ + tg->shares = shares; + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); + set_se_shares(tg->se[i], shares); + } + + /* + * Enable load balance activity on this group, by inserting it back on + * each cpu's rq->leaf_cfs_rq_list. + */ + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + register_fair_sched_group(tg, i); + list_add_rcu(&tg->siblings, &tg->parent->children); + spin_unlock_irqrestore(&task_group_lock, flags); +done: + mutex_unlock(&shares_mutex); + return 0; +} + +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + return div64_u64(runtime << 20, period); +} + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); + + return 0; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; + +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + +#ifdef CONFIG_USER_SCHED + if (tg == &root_task_group) { + period = global_rt_period(); + runtime = global_rt_runtime(); + } +#endif + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); +} + +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); + unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + if (rt_period == 0) + return -EINVAL; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + u64 runtime, period; + int ret = 0; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} + +#ifdef CONFIG_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + struct task_group, css); +} + +static struct cgroup_subsys_state * +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct task_group *tg, *parent; + + if (!cgrp->parent) { + /* This is early initialization for the top cgroup */ + return &init_task_group.css; + } + + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + + return &tg->css; +} + +static void +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_destroy_group(tg); +} + +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk) +{ +#ifdef CONFIG_RT_GROUP_SCHED + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) + return -EINVAL; +#else + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; +#endif + + return 0; +} + +static void +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cont, struct task_struct *tsk) +{ + sched_move_task(tsk); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 shareval) +{ + return sched_group_set_shares(cgroup_tg(cgrp), shareval); +} + +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) tg->shares; +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + s64 val) +{ + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); +} + +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_runtime(cgroup_tg(cgrp)); +} + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", + .read_u64 = cpu_shares_read_u64, + .write_u64 = cpu_shares_write_u64, + }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, +#endif +}; + +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); +} + +struct cgroup_subsys cpu_cgroup_subsys = { + .name = "cpu", + .create = cpu_cgroup_create, + .destroy = cpu_cgroup_destroy, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .populate = cpu_cgroup_populate, + .subsys_id = cpu_cgroup_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 *cpuusage; + struct cpuacct *parent; +}; + +struct cgroup_subsys cpuacct_subsys; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_create( + struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + if (!ca) + return ERR_PTR(-ENOMEM); + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) { + kfree(ca); + return ERR_PTR(-ENOMEM); + } + + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + + return &ca->css; +} + +/* destroy an existing cpu accounting group */ +static void +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + +}; + +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); +} + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +static void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + if (!cpuacct_subsys.active) + return; + + cpu = task_cpu(tsk); + ca = task_ca(tsk); + + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + } +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .create = cpuacct_create, + .destroy = cpuacct_destroy, + .populate = cpuacct_populate, + .subsys_id = cpuacct_subsys_id, +}; +#endif /* CONFIG_CGROUP_CPUACCT */ +#endif /* !DDE_LINUX */ diff --git a/libdde-linux26/lib/src/kernel/sched_cpupri.h b/libdde-linux26/lib/src/kernel/sched_cpupri.h new file mode 100644 index 00000000..642a94ef --- /dev/null +++ b/libdde-linux26/lib/src/kernel/sched_cpupri.h @@ -0,0 +1,37 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include <linux/sched.h> + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + spinlock_t lock; + int count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp, bool bootmem); +void cpupri_cleanup(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/libdde-linux26/lib/src/kernel/sys.c b/libdde-linux26/lib/src/kernel/sys.c new file mode 100644 index 00000000..6533cb97 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/sys.c @@ -0,0 +1,1893 @@ +/* + * linux/kernel/sys.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/utsname.h> +#include <linux/mman.h> +#include <linux/smp_lock.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/prctl.h> +#include <linux/highuid.h> +#include <linux/fs.h> +#include <linux/resource.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/workqueue.h> +#include <linux/capability.h> +#include <linux/device.h> +#include <linux/key.h> +#include <linux/times.h> +#include <linux/posix-timers.h> +#include <linux/security.h> +#include <linux/dcookies.h> +#include <linux/suspend.h> +#include <linux/tty.h> +#include <linux/signal.h> +#include <linux/cn_proc.h> +#include <linux/getcpu.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/seccomp.h> +#include <linux/cpu.h> +#include <linux/ptrace.h> + +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <linux/kprobes.h> +#include <linux/user_namespace.h> + +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/unistd.h> + +#ifndef SET_UNALIGN_CTL +# define SET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_UNALIGN_CTL +# define GET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEMU_CTL +# define SET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEMU_CTL +# define GET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEXC_CTL +# define SET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEXC_CTL +# define GET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef GET_TSC_CTL +# define GET_TSC_CTL(a) (-EINVAL) +#endif +#ifndef SET_TSC_CTL +# define SET_TSC_CTL(a) (-EINVAL) +#endif + +#ifndef DDE_LINUX +/* + * this is where the system-wide overflow UID and GID are defined, for + * architectures that now have 32-bit UID/GID but didn't in the past + */ + +int overflowuid = DEFAULT_OVERFLOWUID; +int overflowgid = DEFAULT_OVERFLOWGID; + +#ifdef CONFIG_UID16 +EXPORT_SYMBOL(overflowuid); +EXPORT_SYMBOL(overflowgid); +#endif + +/* + * the same as above, but for filesystems which can only store a 16-bit + * UID and GID. as such, this is needed on all architectures + */ + +int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; +int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; + +EXPORT_SYMBOL(fs_overflowuid); +EXPORT_SYMBOL(fs_overflowgid); + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +#endif /* DDE_LINUX */ +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +#ifndef DDE_LINUX +/* + * set the priority of a task + * - the caller must hold the RCU read lock + */ +static int set_one_prio(struct task_struct *p, int niceval, int error) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + int no_nice; + + if (pcred->uid != cred->euid && + pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + error = -EPERM; + goto out; + } + if (niceval < task_nice(p) && !can_nice(p, niceval)) { + error = -EACCES; + goto out; + } + no_nice = security_task_setnice(p, niceval); + if (no_nice) { + error = no_nice; + goto out; + } + if (error == -ESRCH) + error = 0; + set_user_nice(p, niceval); +out: + return error; +} + +SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) +{ + struct task_struct *g, *p; + struct user_struct *user; + const struct cred *cred = current_cred(); + int error = -EINVAL; + struct pid *pgrp; + + if (which > PRIO_USER || which < PRIO_PROCESS) + goto out; + + /* normalize: avoid signed division (rounding problems) */ + error = -ESRCH; + if (niceval < -20) + niceval = -20; + if (niceval > 19) + niceval = 19; + + read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = (struct user_struct *) cred->user; + if (!who) + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) + if (__task_cred(p)->uid == who) + error = set_one_prio(p, niceval, error); + while_each_thread(g, p); + if (who != cred->uid) + free_uid(user); /* For find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); +out: + return error; +} + +/* + * Ugh. To avoid negative return values, "getpriority()" will + * not return the normal nice-value, but a negated value that + * has been offset by 20 (ie it returns 40..1 instead of -20..19) + * to stay compatible. + */ +SYSCALL_DEFINE2(getpriority, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + const struct cred *cred = current_cred(); + long niceval, retval = -ESRCH; + struct pid *pgrp; + + if (which > PRIO_USER || which < PRIO_PROCESS) + return -EINVAL; + + read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = (struct user_struct *) cred->user; + if (!who) + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) + if (__task_cred(p)->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + while_each_thread(g, p); + if (who != cred->uid) + free_uid(user); /* for find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); + + return retval; +} + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ +void emergency_restart(void) +{ + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + device_shutdown(); + sysdev_shutdown(); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + sysdev_shutdown(); + printk(KERN_EMERG "System halted.\n"); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + disable_nonboot_cpus(); + sysdev_shutdown(); + printk(KERN_EMERG "Power down.\n"); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + char buffer[256]; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + lock_kernel(); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + unlock_kernel(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + unlock_kernel(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + unlock_kernel(); + return -EFAULT; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + int ret; + ret = kernel_kexec(); + unlock_kernel(); + return ret; + } +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + { + int ret = hibernate(); + unlock_kernel(); + return ret; + } +#endif + + default: + unlock_kernel(); + return -EINVAL; + } + unlock_kernel(); + return 0; +} + +static void deferred_cad(struct work_struct *dummy) +{ + kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +/* + * Unprivileged users may change the real gid to the effective gid + * or vice versa. (BSD-style) + * + * If you set the real gid at all, or set the effective gid to a value not + * equal to the real gid, then the saved gid is set to the new effective gid. + * + * This makes it possible for a setgid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. + * + * The general idea is that a program which uses just setregid() will be + * 100% compatible with BSD. A program which uses just setgid() will be + * 100% compatible with POSIX with saved IDs. + * + * SMP: There are not races, the GIDs are checked only by filesystem + * operations (as far as semantic preservation is concerned). + */ +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); + if (retval) + goto error; + + retval = -EPERM; + if (rgid != (gid_t) -1) { + if (old->gid == rgid || + old->egid == rgid || + capable(CAP_SETGID)) + new->gid = rgid; + else + goto error; + } + if (egid != (gid_t) -1) { + if (old->gid == egid || + old->egid == egid || + old->sgid == egid || + capable(CAP_SETGID)) + new->egid = egid; + else + goto error; + } + + if (rgid != (gid_t) -1 || + (egid != (gid_t) -1 && egid != old->gid)) + new->sgid = new->egid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * setgid() is implemented like SysV w/ SAVED_IDS + * + * SMP: Same implicit races as above. + */ +SYSCALL_DEFINE1(setgid, gid_t, gid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); + if (retval) + goto error; + + retval = -EPERM; + if (capable(CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = gid; + else if (gid == old->gid || gid == old->sgid) + new->egid = new->fsgid = gid; + else + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * change the user struct in a credentials set to match the new UID + */ +static int set_user(struct cred *new) +{ + struct user_struct *new_user; + + new_user = alloc_uid(current_user_ns(), new->uid); + if (!new_user) + return -EAGAIN; + + if (!task_can_switch_user(new_user, current)) { + free_uid(new_user); + return -EINVAL; + } + + if (atomic_read(&new_user->processes) >= + current->signal->rlim[RLIMIT_NPROC].rlim_cur && + new_user != INIT_USER) { + free_uid(new_user); + return -EAGAIN; + } + + free_uid(new->user); + new->user = new_user; + return 0; +} + +/* + * Unprivileged users may change the real uid to the effective uid + * or vice versa. (BSD-style) + * + * If you set the real uid at all, or set the effective uid to a value not + * equal to the real uid, then the saved uid is set to the new effective uid. + * + * This makes it possible for a setuid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. + * + * The general idea is that a program which uses just setreuid() will be + * 100% compatible with BSD. A program which uses just setuid() will be + * 100% compatible with POSIX with saved IDs. + */ +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); + if (retval) + goto error; + + retval = -EPERM; + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (old->uid != ruid && + old->euid != ruid && + !capable(CAP_SETUID)) + goto error; + } + + if (euid != (uid_t) -1) { + new->euid = euid; + if (old->uid != euid && + old->euid != euid && + old->suid != euid && + !capable(CAP_SETUID)) + goto error; + } + + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + if (ruid != (uid_t) -1 || + (euid != (uid_t) -1 && euid != old->uid)) + new->suid = new->euid; + new->fsuid = new->euid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * setuid() is implemented like SysV with SAVED_IDS + * + * Note that SAVED_ID's is deficient in that a setuid root program + * like sendmail, for example, cannot set its uid to be a normal + * user and then switch back, because if you're root, setuid() sets + * the saved uid too. If you don't like this, blame the bright people + * in the POSIX committee and/or USG. Note that the BSD-style setreuid() + * will allow a root program to temporarily drop privileges and be able to + * regain them by swapping the real and effective uid. + */ +SYSCALL_DEFINE1(setuid, uid_t, uid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); + if (retval) + goto error; + + retval = -EPERM; + if (capable(CAP_SETUID)) { + new->suid = new->uid = uid; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + } else if (uid != old->uid && uid != new->suid) { + goto error; + } + + new->fsuid = new->euid = uid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + + +/* + * This function implements a generic ability to update ruid, euid, + * and suid. This allows you to implement the 4.4 compatible seteuid(). + */ +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); + if (retval) + goto error; + old = current_cred(); + + retval = -EPERM; + if (!capable(CAP_SETUID)) { + if (ruid != (uid_t) -1 && ruid != old->uid && + ruid != old->euid && ruid != old->suid) + goto error; + if (euid != (uid_t) -1 && euid != old->uid && + euid != old->euid && euid != old->suid) + goto error; + if (suid != (uid_t) -1 && suid != old->uid && + suid != old->euid && suid != old->suid) + goto error; + } + + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + } + if (euid != (uid_t) -1) + new->euid = euid; + if (suid != (uid_t) -1) + new->suid = suid; + new->fsuid = new->euid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) + retval = put_user(cred->suid, suid); + + return retval; +} + +/* + * Same as above, but for rgid, egid, sgid. + */ +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); + if (retval) + goto error; + + retval = -EPERM; + if (!capable(CAP_SETGID)) { + if (rgid != (gid_t) -1 && rgid != old->gid && + rgid != old->egid && rgid != old->sgid) + goto error; + if (egid != (gid_t) -1 && egid != old->gid && + egid != old->egid && egid != old->sgid) + goto error; + if (sgid != (gid_t) -1 && sgid != old->gid && + sgid != old->egid && sgid != old->sgid) + goto error; + } + + if (rgid != (gid_t) -1) + new->gid = rgid; + if (egid != (gid_t) -1) + new->egid = egid; + if (sgid != (gid_t) -1) + new->sgid = sgid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) + retval = put_user(cred->sgid, sgid); + + return retval; +} + + +/* + * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This + * is used for "access()" and for the NFS daemon (letting nfsd stay at + * whatever uid it wants to). It normally shadows "euid", except when + * explicitly set by setfsuid() or for access.. + */ +SYSCALL_DEFINE1(setfsuid, uid_t, uid) +{ + const struct cred *old; + struct cred *new; + uid_t old_fsuid; + + new = prepare_creds(); + if (!new) + return current_fsuid(); + old = current_cred(); + old_fsuid = old->fsuid; + + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) + goto error; + + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || + capable(CAP_SETUID)) { + if (uid != old_fsuid) { + new->fsuid = uid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + goto change_okay; + } + } + +error: + abort_creds(new); + return old_fsuid; + +change_okay: + commit_creds(new); + return old_fsuid; +} + +/* + * Samma på svenska.. + */ +SYSCALL_DEFINE1(setfsgid, gid_t, gid) +{ + const struct cred *old; + struct cred *new; + gid_t old_fsgid; + + new = prepare_creds(); + if (!new) + return current_fsgid(); + old = current_cred(); + old_fsgid = old->fsgid; + + if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) + goto error; + + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || + capable(CAP_SETGID)) { + if (gid != old_fsgid) { + new->fsgid = gid; + goto change_okay; + } + } + +error: + abort_creds(new); + return old_fsgid; + +change_okay: + commit_creds(new); + return old_fsgid; +} + +void do_sys_times(struct tms *tms) +{ + struct task_cputime cputime; + cputime_t cutime, cstime; + + thread_group_cputime(current, &cputime); + spin_lock_irq(¤t->sighand->siglock); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(cputime.utime); + tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + +SYSCALL_DEFINE1(times, struct tms __user *, tbuf) +{ + if (tbuf) { + struct tms tmp; + + do_sys_times(&tmp); + if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) + return -EFAULT; + } + force_successful_syscall_return(); + return (long) jiffies_64_to_clock_t(get_jiffies_64()); +} + +/* + * This needs some heavy checking ... + * I just haven't the stomach for it. I also don't fully + * understand sessions/pgrp etc. Let somebody who does explain it. + * + * OK, I think I have the protection semantics right.... this is really + * only important on a multi-user system anyway, to make sure one user + * can't send a signal to a process owned by another. -TYT, 12/12/91 + * + * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. + * LBT 04.03.94 + */ +SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) +{ + struct task_struct *p; + struct task_struct *group_leader = current->group_leader; + struct pid *pgrp; + int err; + + if (!pid) + pid = task_pid_vnr(group_leader); + if (!pgid) + pgid = pid; + if (pgid < 0) + return -EINVAL; + + /* From this point forward we keep holding onto the tasklist lock + * so that our parent does not change from under us. -DaveM + */ + write_lock_irq(&tasklist_lock); + + err = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + + err = -EINVAL; + if (!thread_group_leader(p)) + goto out; + + if (same_thread_group(p->real_parent, group_leader)) { + err = -EPERM; + if (task_session(p) != task_session(group_leader)) + goto out; + err = -EACCES; + if (p->did_exec) + goto out; + } else { + err = -ESRCH; + if (p != group_leader) + goto out; + } + + err = -EPERM; + if (p->signal->leader) + goto out; + + pgrp = task_pid(p); + if (pgid != pid) { + struct task_struct *g; + + pgrp = find_vpid(pgid); + g = pid_task(pgrp, PIDTYPE_PGID); + if (!g || task_session(g) != task_session(group_leader)) + goto out; + } + + err = security_task_setpgid(p, pgid); + if (err) + goto out; + + if (task_pgrp(p) != pgrp) { + change_pid(p, PIDTYPE_PGID, pgrp); + set_task_pgrp(p, pid_nr(pgrp)); + } + + err = 0; +out: + /* All paths lead to here, thus we are safe. -DaveM */ + write_unlock_irq(&tasklist_lock); + return err; +} + +SYSCALL_DEFINE1(getpgid, pid_t, pid) +{ + struct task_struct *p; + struct pid *grp; + int retval; + + rcu_read_lock(); + if (!pid) + grp = task_pgrp(current); + else { + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + grp = task_pgrp(p); + if (!grp) + goto out; + + retval = security_task_getpgid(p); + if (retval) + goto out; + } + retval = pid_vnr(grp); +out: + rcu_read_unlock(); + return retval; +} + +#ifdef __ARCH_WANT_SYS_GETPGRP + +SYSCALL_DEFINE0(getpgrp) +{ + return sys_getpgid(0); +} + +#endif + +SYSCALL_DEFINE1(getsid, pid_t, pid) +{ + struct task_struct *p; + struct pid *sid; + int retval; + + rcu_read_lock(); + if (!pid) + sid = task_session(current); + else { + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + sid = task_session(p); + if (!sid) + goto out; + + retval = security_task_getsid(p); + if (retval) + goto out; + } + retval = pid_vnr(sid); +out: + rcu_read_unlock(); + return retval; +} + +SYSCALL_DEFINE0(setsid) +{ + struct task_struct *group_leader = current->group_leader; + struct pid *sid = task_pid(group_leader); + pid_t session = pid_vnr(sid); + int err = -EPERM; + + write_lock_irq(&tasklist_lock); + /* Fail if I am already a session leader */ + if (group_leader->signal->leader) + goto out; + + /* Fail if a process group id already exists that equals the + * proposed session id. + */ + if (pid_task(sid, PIDTYPE_PGID)) + goto out; + + group_leader->signal->leader = 1; + __set_special_pids(sid); + + proc_clear_tty(group_leader); + + err = session; +out: + write_unlock_irq(&tasklist_lock); + return err; +} + +/* + * Supplementary group IDs + */ + +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + +struct group_info *groups_alloc(int gidsetsize) +{ + struct group_info *group_info; + int nblocks; + int i; + + nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; + /* Make sure we always allocate at least one indirect block pointer */ + nblocks = nblocks ? : 1; + group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); + if (!group_info) + return NULL; + group_info->ngroups = gidsetsize; + group_info->nblocks = nblocks; + atomic_set(&group_info->usage, 1); + + if (gidsetsize <= NGROUPS_SMALL) + group_info->blocks[0] = group_info->small_block; + else { + for (i = 0; i < nblocks; i++) { + gid_t *b; + b = (void *)__get_free_page(GFP_USER); + if (!b) + goto out_undo_partial_alloc; + group_info->blocks[i] = b; + } + } + return group_info; + +out_undo_partial_alloc: + while (--i >= 0) { + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); + return NULL; +} + +EXPORT_SYMBOL(groups_alloc); + +void groups_free(struct group_info *group_info) +{ + if (group_info->blocks[0] != group_info->small_block) { + int i; + for (i = 0; i < group_info->nblocks; i++) + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); +} + +EXPORT_SYMBOL(groups_free); + +/* export the group_info to a user-space array */ +static int groups_to_user(gid_t __user *grouplist, + const struct group_info *group_info) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_to_user(grouplist, group_info->blocks[i], len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + return 0; +} + +/* fill a group_info from a user-space array - it must be allocated already */ +static int groups_from_user(struct group_info *group_info, + gid_t __user *grouplist) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_from_user(group_info->blocks[i], grouplist, len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + return 0; +} + +/* a simple Shell sort */ +static void groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = GROUP_AT(group_info, right); + + while (left >= 0 && GROUP_AT(group_info, left) > tmp) { + GROUP_AT(group_info, right) = + GROUP_AT(group_info, left); + right = left; + left -= stride; + } + GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} + +/* a simple bsearch */ +int groups_search(const struct group_info *group_info, gid_t grp) +{ + unsigned int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + unsigned int mid = (left+right)/2; + int cmp = grp - GROUP_AT(group_info, mid); + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +/** + * set_groups - Change a group subscription in a set of credentials + * @new: The newly prepared set of credentials to alter + * @group_info: The group list to install + * + * Validate a group subscription and, if valid, insert it into a set + * of credentials. + */ +int set_groups(struct cred *new, struct group_info *group_info) +{ + int retval; + + retval = security_task_setgroups(group_info); + if (retval) + return retval; + + put_group_info(new->group_info); + groups_sort(group_info); + get_group_info(group_info); + new->group_info = group_info; + return 0; +} + +EXPORT_SYMBOL(set_groups); + +/** + * set_current_groups - Change current's group subscription + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon current's task + * security record. + */ +int set_current_groups(struct group_info *group_info) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = set_groups(new, group_info); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); +} + +EXPORT_SYMBOL(set_current_groups); + +SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + const struct cred *cred = current_cred(); + int i; + + if (gidsetsize < 0) + return -EINVAL; + + /* no need to grab task_lock here; it cannot change */ + i = cred->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups_to_user(grouplist, cred->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + return i; +} + +/* + * SMP: Our groups are copy-on-write. We can set them safely + * without another task interfering. + */ + +SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + struct group_info *group_info; + int retval; + + if (!capable(CAP_SETGID)) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +/* + * Check whether we're fsgid/egid or in the supplemental group.. + */ +int in_group_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->fsgid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_group_p); + +int in_egroup_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->egid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_egroup_p); + +DECLARE_RWSEM(uts_sem); + +SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) +{ + int errno = 0; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof *name)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +} + +SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +#ifdef __ARCH_WANT_SYS_GETHOSTNAME + +SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) +{ + int i, errno; + struct new_utsname *u; + + if (len < 0) + return -EINVAL; + down_read(&uts_sem); + u = utsname(); + i = 1 + strlen(u->nodename); + if (i > len) + i = len; + errno = 0; + if (copy_to_user(name, u->nodename, i)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +} + +#endif + +/* + * Only setdomainname; getdomainname can be implemented by calling + * uname() + */ +SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + if (resource >= RLIM_NLIMITS) + return -EINVAL; + else { + struct rlimit value; + task_lock(current->group_leader); + value = current->signal->rlim[resource]; + task_unlock(current->group_leader); + return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + } +} + +#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT + +/* + * Back compatibility for getrlimit. Needed for some apps. + */ + +SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct rlimit __user *, rlim) +{ + struct rlimit x; + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + task_lock(current->group_leader); + x = current->signal->rlim[resource]; + task_unlock(current->group_leader); + if (x.rlim_cur > 0x7FFFFFFF) + x.rlim_cur = 0x7FFFFFFF; + if (x.rlim_max > 0x7FFFFFFF) + x.rlim_max = 0x7FFFFFFF; + return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; +} + +#endif + +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim, *old_rlim; + int retval; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; + old_rlim = current->signal->rlim + resource; + if ((new_rlim.rlim_max > old_rlim->rlim_max) && + !capable(CAP_SYS_RESOURCE)) + return -EPERM; + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; + + retval = security_task_setrlimit(resource, &new_rlim); + if (retval) + return retval; + + if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim.rlim_cur = 1; + } + + task_lock(current->group_leader); + *old_rlim = new_rlim; + task_unlock(current->group_leader); + + if (resource != RLIMIT_CPU) + goto out; + + /* + * RLIMIT_CPU handling. Note that the kernel fails to return an error + * code if it rejected the user's attempt to set RLIMIT_CPU. This is a + * very long-standing error, and fixing it now risks breakage of + * applications, so we live with it + */ + if (new_rlim.rlim_cur == RLIM_INFINITY) + goto out; + + update_rlimit_cpu(new_rlim.rlim_cur); +out: + return 0; +} + +/* + * It would make sense to put struct rusage in the task_struct, + * except that would make the task_struct be *really big*. After + * task_struct gets moved into malloc'ed memory, it would + * make sense to do this. It will make moving the rest of the information + * a lot simpler! (Which we're not doing right now because we're not + * measuring them yet). + * + * When sampling multiple threads for RUSAGE_SELF, under SMP we might have + * races with threads incrementing their own counters. But since word + * reads are atomic, we either get new values or old values and we don't + * care which for the sums. We always take the siglock to protect reading + * the c* fields from p->signal from races with exit.c updating those + * fields when reaping, so a sample either gets all the additions of a + * given child after it's reaped, or none so this sample is before reaping. + * + * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. + * + */ + +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) +{ + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); +} + +static void k_getrusage(struct task_struct *p, int who, struct rusage *r) +{ + struct task_struct *t; + unsigned long flags; + cputime_t utime, stime; + struct task_cputime cputime; + + memset((char *) r, 0, sizeof *r); + utime = stime = cputime_zero; + + if (who == RUSAGE_THREAD) { + utime = task_utime(current); + stime = task_stime(current); + accumulate_thread_rusage(p, r); + goto out; + } + + if (!lock_task_sighand(p, &flags)) + return; + + switch (who) { + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + + if (who == RUSAGE_CHILDREN) + break; + + case RUSAGE_SELF: + thread_group_cputime(p, &cputime); + utime = cputime_add(utime, cputime.utime); + stime = cputime_add(stime, cputime.stime); + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + t = p; + do { + accumulate_thread_rusage(t, r); + t = next_thread(t); + } while (t != p); + break; + + default: + BUG(); + } + unlock_task_sighand(p, &flags); + +out: + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); +} + +int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +{ + struct rusage r; + k_getrusage(p, who, &r); + return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; +} + +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) +{ + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) + return -EINVAL; + return getrusage(current, who, ru); +} + +SYSCALL_DEFINE1(umask, int, mask) +{ + mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); + return mask; +} + +SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + unsigned long, arg4, unsigned long, arg5) +{ + struct task_struct *me = current; + unsigned char comm[sizeof(me->comm)]; + long error; + + error = security_task_prctl(option, arg2, arg3, arg4, arg5); + if (error != -ENOSYS) + return error; + + error = 0; + switch (option) { + case PR_SET_PDEATHSIG: + if (!valid_signal(arg2)) { + error = -EINVAL; + break; + } + me->pdeath_signal = arg2; + error = 0; + break; + case PR_GET_PDEATHSIG: + error = put_user(me->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: + if (arg2 < 0 || arg2 > 1) { + error = -EINVAL; + break; + } + set_dumpable(me->mm, arg2); + error = 0; + break; + + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(me, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(me, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(me, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(me, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(me, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(me, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 != PR_TIMING_STATISTICAL) + error = -EINVAL; + else + error = 0; + break; + + case PR_SET_NAME: + comm[sizeof(me->comm)-1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) + return -EFAULT; + set_task_comm(me, comm); + return 0; + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, + sizeof(comm))) + return -EFAULT; + return 0; + case PR_GET_ENDIAN: + error = GET_ENDIAN(me, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(me, arg2); + break; + + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = + current->default_timer_slack_ns; + else + current->timer_slack_ns = arg2; + error = 0; + break; + default: + error = -EINVAL; + break; + } + return error; +} + +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) +{ + int err = 0; + int cpu = raw_smp_processor_id(); + if (cpup) + err |= put_user(cpu, cpup); + if (nodep) + err |= put_user(cpu_to_node(cpu), nodep); + return err ? -EFAULT : 0; +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(char **argv, char **envp) +{ + argv_free(argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setcleanup(info, argv_cleanup); + + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); +#endif /* DDE_LINUX */ diff --git a/libdde-linux26/lib/src/kernel/time.c b/libdde-linux26/lib/src/kernel/time.c new file mode 100644 index 00000000..ce5b5fd4 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/time.c @@ -0,0 +1,765 @@ +/* + * linux/kernel/time.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various + * time related system calls: time, stime, gettimeofday, settimeofday, + * adjtime + */ +/* + * Modification history kernel/time.c + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). + * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include <linux/module.h> +#include <linux/timex.h> +#include <linux/capability.h> +#include <linux/clocksource.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/math64.h> +#include <linux/ptrace.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> + +#include "timeconst.h" +#include <ddekit/timer.h> + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +SYSCALL_DEFINE1(time, time_t __user *, tloc) +{ + time_t i = get_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ + +SYSCALL_DEFINE1(stime, time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + if (likely(tv != NULL)) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (copy_to_user(tv, &ktv, sizeof(ktv))) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +static inline void warp_clock(void) +{ + write_seqlock_irq(&xtime_lock); + wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; + xtime.tv_sec += sys_tz.tz_minuteswest * 60; + update_xtime_cache(0); + write_sequnlock_irq(&xtime_lock); + clock_was_set(); +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + if (tv && !timespec_valid(tv)) + return -EINVAL; + + error = security_settime(tv, tz); + if (error) + return error; + + if (tz) { + /* SMP safe, global irq locking makes it work. */ + sys_tz = *tz; + update_vsyscall_tz(); + if (firsttime) { + firsttime = 0; + if (!tv) + warp_clock(); + } + } + if (tv) + { + /* SMP safe, again the code in arch/foo/time.c should + * globally block out interrupts when it runs. + */ + return do_settimeofday(tv); + } + return 0; +} + +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; + + if (tv) { + if (copy_from_user(&user_tv, tv, sizeof(*tv))) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} + +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +{ + struct timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. But bear in mind that the structures + * may change + */ + if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +} + +#ifndef DDE_LINUX +/** + * current_fs_time - Return FS time + * @sb: Superblock. + * + * Return the current time truncated to the time granularity supported by + * the fs. + */ +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = current_kernel_time(); + return timespec_trunc(now, sb->s_time_gran); +} +EXPORT_SYMBOL(current_fs_time); + +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int inline jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; +# else + return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int inline jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); +#endif + +/** + * timespec_trunc - Truncate timespec to a granularity + * @t: Timespec + * @gran: Granularity in ns. + * + * Truncate a timespec to a granularity. gran must be smaller than a second. + * Always rounds down. + * + * This function should be only used for timestamps returned by + * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because + * it doesn't handle the better resolution of the latter. + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* + * Division is pretty slow so avoid it for common cases. + * Currently current_kernel_time() never returns better than + * jiffies resolution. Exploit that. + */ + if (gran <= jiffies_to_usecs(1) * 1000) { + /* nothing */ + } else if (gran == 1000000000) { + t.tv_nsec = 0; + } else { + t.tv_nsec -= t.tv_nsec % gran; + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + +#ifndef CONFIG_GENERIC_TIME +/* + * Simulate gettimeofday using do_gettimeofday which only allows a timeval + * and therefore only yields usec accuracy + */ +void getnstimeofday(struct timespec *tv) +{ + struct timeval x; + + do_gettimeofday(&x); + tv->tv_sec = x.tv_sec; + tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; +} +EXPORT_SYMBOL_GPL(getnstimeofday); +#endif + +/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * WARNING: this function will overflow on 2106-02-07 06:28:16 on + * machines where long is 32-bit! (However, as time_t is signed, we + * will already get problems at other places on 2038-01-19 03:14:08) + */ +unsigned long +mktime(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) +{ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((unsigned long) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} + +EXPORT_SYMBOL(mktime); + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec); + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +struct timespec ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec); + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +struct timeval ns_to_timeval(const s64 nsec) +{ + struct timespec ts = ns_to_timespec(nsec); + struct timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_timeval); + +#ifndef DDE_LINUX +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldn't overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); +#else /* DDE_LINUX */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ + return (j*1000) / HZ; +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ + return (j*1000000) / HZ; +} +EXPORT_SYMBOL(jiffies_to_usecs); + +unsigned long msecs_to_jiffies(const unsigned int m) +{ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return (m * HZ + MSEC_PER_SEC) / MSEC_PER_SEC; +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; +} +EXPORT_SYMBOL(usecs_to_jiffies); +#endif + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffiesv, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffiesv * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} +EXPORT_SYMBOL(timeval_to_jiffies); + +void jiffies_to_timeval(const unsigned long jiffiesv, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + + value->tv_sec = div_u64_rem((u64)jiffiesv * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_usec = rem / NSEC_PER_USEC; +} +EXPORT_SYMBOL(jiffies_to_timeval); + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else + return x / (HZ / USER_HZ); +# endif +#else + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +#ifndef DDE_LINUX +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + return div_u64((u64)x * HZ, USER_HZ); +#endif +} +#else +unsigned long clock_t_to_jiffies(unsigned long x) +{ + assert (HZ); + assert (USER_HZ); + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +} +#endif /* DDE_LINUX */ +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +# elif HZ > USER_HZ + x = div_u64(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); +#elif (USER_HZ % 512) == 0 + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); +#endif +} + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + +/* + * Add two timespec values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0) + */ +struct timespec timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} diff --git a/libdde-linux26/lib/src/kernel/timeconst.pl b/libdde-linux26/lib/src/kernel/timeconst.pl new file mode 100755 index 00000000..d459895f --- /dev/null +++ b/libdde-linux26/lib/src/kernel/timeconst.pl @@ -0,0 +1,378 @@ +#!/usr/bin/perl +# ----------------------------------------------------------------------- +# +# Copyright 2007-2008 rPath, Inc. - All Rights Reserved +# +# This file is part of the Linux kernel, and is made available under +# the terms of the GNU General Public License version 2 or (at your +# option) any later version; incorporated herein by reference. +# +# ----------------------------------------------------------------------- +# + +# +# Usage: timeconst.pl HZ > timeconst.h +# + +# Precomputed values for systems without Math::BigInt +# Generated by: +# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 +%canned_values = ( + 24 => [ + '0xa6aaaaab','0x2aaaaaa',26, + 125,3, + '0xc49ba5e4','0x1fbe76c8b4',37, + 3,125, + '0xa2c2aaab','0xaaaa',16, + 125000,3, + '0xc9539b89','0x7fffbce4217d',47, + 3,125000, + ], 32 => [ + '0xfa000000','0x6000000',27, + 125,4, + '0x83126e98','0xfdf3b645a',36, + 4,125, + '0xf4240000','0x0',17, + 31250,1, + '0x8637bd06','0x3fff79c842fa',46, + 1,31250, + ], 48 => [ + '0xa6aaaaab','0x6aaaaaa',27, + 125,6, + '0xc49ba5e4','0xfdf3b645a',36, + 6,125, + '0xa2c2aaab','0x15555',17, + 62500,3, + '0xc9539b89','0x3fffbce4217d',46, + 3,62500, + ], 64 => [ + '0xfa000000','0xe000000',28, + 125,8, + '0x83126e98','0x7ef9db22d',35, + 8,125, + '0xf4240000','0x0',18, + 15625,1, + '0x8637bd06','0x1fff79c842fa',45, + 1,15625, + ], 100 => [ + '0xa0000000','0x0',28, + 10,1, + '0xcccccccd','0x733333333',35, + 1,10, + '0x9c400000','0x0',18, + 10000,1, + '0xd1b71759','0x1fff2e48e8a7',45, + 1,10000, + ], 122 => [ + '0x8325c53f','0xfbcda3a',28, + 500,61, + '0xf9db22d1','0x7fbe76c8b',35, + 61,500, + '0x8012e2a0','0x3ef36',18, + 500000,61, + '0xffda4053','0x1ffffbce4217',45, + 61,500000, + ], 128 => [ + '0xfa000000','0x1e000000',29, + 125,16, + '0x83126e98','0x3f7ced916',34, + 16,125, + '0xf4240000','0x40000',19, + 15625,2, + '0x8637bd06','0xfffbce4217d',44, + 2,15625, + ], 200 => [ + '0xa0000000','0x0',29, + 5,1, + '0xcccccccd','0x333333333',34, + 1,5, + '0x9c400000','0x0',19, + 5000,1, + '0xd1b71759','0xfff2e48e8a7',44, + 1,5000, + ], 250 => [ + '0x80000000','0x0',29, + 4,1, + '0x80000000','0x180000000',33, + 1,4, + '0xfa000000','0x0',20, + 4000,1, + '0x83126e98','0x7ff7ced9168',43, + 1,4000, + ], 256 => [ + '0xfa000000','0x3e000000',30, + 125,32, + '0x83126e98','0x1fbe76c8b',33, + 32,125, + '0xf4240000','0xc0000',20, + 15625,4, + '0x8637bd06','0x7ffde7210be',43, + 4,15625, + ], 300 => [ + '0xd5555556','0x2aaaaaaa',30, + 10,3, + '0x9999999a','0x1cccccccc',33, + 3,10, + '0xd0555556','0xaaaaa',20, + 10000,3, + '0x9d495183','0x7ffcb923a29',43, + 3,10000, + ], 512 => [ + '0xfa000000','0x7e000000',31, + 125,64, + '0x83126e98','0xfdf3b645',32, + 64,125, + '0xf4240000','0x1c0000',21, + 15625,8, + '0x8637bd06','0x3ffef39085f',42, + 8,15625, + ], 1000 => [ + '0x80000000','0x0',31, + 1,1, + '0x80000000','0x0',31, + 1,1, + '0xfa000000','0x0',22, + 1000,1, + '0x83126e98','0x1ff7ced9168',41, + 1,1000, + ], 1024 => [ + '0xfa000000','0xfe000000',32, + 125,128, + '0x83126e98','0x7ef9db22',31, + 128,125, + '0xf4240000','0x3c0000',22, + 15625,16, + '0x8637bd06','0x1fff79c842f',41, + 16,15625, + ], 1200 => [ + '0xd5555556','0xd5555555',32, + 5,6, + '0x9999999a','0x66666666',31, + 6,5, + '0xd0555556','0x2aaaaa',22, + 2500,3, + '0x9d495183','0x1ffcb923a29',41, + 3,2500, + ] +); + +$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; + +sub bint($) +{ + my($x) = @_; + return Math::BigInt->new($x); +} + +# +# Constants for division by reciprocal multiplication. +# (bits, numerator, denominator) +# +sub fmul($$$) +{ + my ($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + return scalar (($n << $b)+$d-bint(1))/$d; +} + +sub fadj($$$) +{ + my($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + $d = $d/bgcd($n, $d); + return scalar (($d-bint(1)) << $b)/$d; +} + +sub fmuls($$$) { + my($b,$n,$d) = @_; + my($s,$m); + my($thres) = bint(1) << ($b-1); + + $n = bint($n); + $d = bint($d); + + for ($s = 0; 1; $s++) { + $m = fmul($s,$n,$d); + return $s if ($m >= $thres); + } + return 0; +} + +# Generate a hex value if the result fits in 64 bits; +# otherwise skip. +sub bignum_hex($) { + my($x) = @_; + my $s = $x->as_hex(); + + return (length($s) > 18) ? undef : $s; +} + +# Provides mul, adj, and shr factors for a specific +# (bit, time, hz) combination +sub muladj($$$) { + my($b, $t, $hz) = @_; + my $s = fmuls($b, $t, $hz); + my $m = fmul($s, $t, $hz); + my $a = fadj($s, $t, $hz); + return (bignum_hex($m), bignum_hex($a), $s); +} + +# Provides numerator, denominator values +sub numden($$) { + my($n, $d) = @_; + my $g = bgcd($n, $d); + return ($n/$g, $d/$g); +} + +# All values for a specific (time, hz) combo +sub conversions($$) { + my ($t, $hz) = @_; + my @val = (); + + # HZ_TO_xx + push(@val, muladj(32, $t, $hz)); + push(@val, numden($t, $hz)); + + # xx_TO_HZ + push(@val, muladj(32, $hz, $t)); + push(@val, numden($hz, $t)); + + return @val; +} + +sub compute_values($) { + my($hz) = @_; + my @val = (); + my $s, $m, $a, $g; + + if (!$has_bigint) { + die "$0: HZ == $hz not canned and ". + "Math::BigInt not available\n"; + } + + # MSEC conversions + push(@val, conversions(1000, $hz)); + + # USEC conversions + push(@val, conversions(1000000, $hz)); + + return @val; +} + +sub outputval($$) +{ + my($name, $val) = @_; + my $csuf; + + if (defined($val)) { + if ($name !~ /SHR/) { + $val = "U64_C($val)"; + } + printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; + } +} + +sub output($@) +{ + my($hz, @val) = @_; + my $pfx, $bit, $suf, $s, $m, $a; + + print "/* Automatically generated by kernel/timeconst.pl */\n"; + print "/* Conversion constants for HZ == $hz */\n"; + print "\n"; + print "#ifndef KERNEL_TIMECONST_H\n"; + print "#define KERNEL_TIMECONST_H\n"; + print "\n"; + + print "#include <linux/param.h>\n"; + print "#include <linux/types.h>\n"; + + print "\n"; + print "#if HZ != $hz && !defined(DDE_LINUX)\n"; + print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; + print "#endif\n"; + print "\n"; + + foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', + 'HZ_TO_USEC','USEC_TO_HZ') { + foreach $bit (32) { + foreach $suf ('MUL', 'ADJ', 'SHR') { + outputval("${pfx}_$suf$bit", shift(@val)); + } + } + foreach $suf ('NUM', 'DEN') { + outputval("${pfx}_$suf", shift(@val)); + } + } + + print "\n"; + print "#endif /* KERNEL_TIMECONST_H */\n"; +} + +# Pretty-print Perl values +sub perlvals(@) { + my $v; + my @l = (); + + foreach $v (@_) { + if (!defined($v)) { + push(@l, 'undef'); + } elsif ($v =~ /^0x/) { + push(@l, "\'".$v."\'"); + } else { + push(@l, $v.''); + } + } + return join(',', @l); +} + +($hz) = @ARGV; + +# Use this to generate the %canned_values structure +if ($hz eq '--can') { + shift(@ARGV); + @hzlist = sort {$a <=> $b} (@ARGV); + + print "# Precomputed values for systems without Math::BigInt\n"; + print "# Generated by:\n"; + print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; + print "\%canned_values = (\n"; + my $pf = "\t"; + foreach $hz (@hzlist) { + my @values = compute_values($hz); + print "$pf$hz => [\n"; + while (scalar(@values)) { + my $bit; + foreach $bit (32) { + my $m = shift(@values); + my $a = shift(@values); + my $s = shift(@values); + print "\t\t", perlvals($m,$a,$s), ",\n"; + } + my $n = shift(@values); + my $d = shift(@values); + print "\t\t", perlvals($n,$d), ",\n"; + } + print "\t]"; + $pf = ', '; + } + print "\n);\n"; +} else { + $hz += 0; # Force to number + if ($hz < 1) { + die "Usage: $0 HZ\n"; + } + + @val = @{$canned_values{$hz}}; + if (!defined(@val)) { + @val = compute_values($hz); + } + output($hz, @val); +} +exit 0; diff --git a/libdde-linux26/lib/src/kernel/timer.c b/libdde-linux26/lib/src/kernel/timer.c new file mode 100644 index 00000000..951d6ffc --- /dev/null +++ b/libdde-linux26/lib/src/kernel/timer.c @@ -0,0 +1,1590 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers, basic process system calls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include <linux/kernel_stat.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pid_namespace.h> +#include <linux/notifier.h> +#include <linux/thread_info.h> +#include <linux/time.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/div64.h> +#include <asm/timex.h> +#include <asm/io.h> + +#include <ddekit/timer.h> + +#ifndef DDE_LINUX + +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +/* + * per-CPU timer vector definitions: + */ +#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +struct tvec { + struct list_head vec[TVN_SIZE]; +}; + +struct tvec_root { + struct list_head vec[TVR_SIZE]; +}; + +struct tvec_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long timer_jiffies; + struct tvec_root tv1; + struct tvec tv2; + struct tvec tv3; + struct tvec tv4; + struct tvec tv5; +} ____cacheline_aligned; + +struct tvec_base boot_tvec_bases; +EXPORT_SYMBOL(boot_tvec_bases); +static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; + +/* + * Note that all tvec_bases are 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. Use the LSB for + * the new flag to indicate whether the timer is deferrable + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +{ + return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); +} + +static inline void +timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +{ + timer->base = (struct tvec_base *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} +#endif /* DDE_LINUX */ + +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. + */ + if (rem < HZ/4 && !force_up) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), false); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + + +#ifndef DDE_LINUX +static inline void set_running_timer(struct tvec_base *base, + struct timer_list *timer) +{ +#ifdef CONFIG_SMP + base->running_timer = timer; +#endif +} + +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + unsigned long expires = timer->expires; + unsigned long idx = expires - base->timer_jiffies; + struct list_head *vec; + + if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = base->tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = base->tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = base->tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = base->tv4.vec + i; + } else if ((signed long) idx < 0) { + /* + * Can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + } else { + int i; + /* If the timeout is larger than 0xffffffff on 64-bit + * architectures then we use the maximum timeout: + */ + if (idx > 0xffffffffUL) { + idx = 0xffffffffUL; + expires = idx + base->timer_jiffies; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; + } + /* + * Timers are FIFO: + */ + list_add_tail(&timer->entry, vec); +} + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} + +static void timer_stats_account_timer(struct timer_list *timer) +{ + unsigned int flag = 0; + + if (unlikely(tbase_get_deferrable(timer->base))) + flag |= TIMER_STATS_FLAG_DEFERRABLE; + + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, flag); +} + +#else +static void timer_stats_account_timer(struct timer_list *timer) {} +#endif + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr timer_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (timer->entry.next == NULL && + timer->entry.prev == TIMER_ENTRY_STATIC) { + debug_object_init(timer, &timer_debug_descr); + debug_object_activate(timer, &timer_debug_descr); + return 0; + } else { + WARN_ON_ONCE(1); + } + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_free(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} + +static void __init_timer(struct timer_list *timer); + +void init_timer_on_stack(struct timer_list *timer) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + __init_timer(timer); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +#endif + +static void __init_timer(struct timer_list *timer) +{ + timer->entry.next = NULL; + timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif +} + +/** + * init_timer - initialize a timer. + * @timer: the timer to be initialized + * + * init_timer() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void init_timer(struct timer_list *timer) +{ + debug_timer_init(timer); + __init_timer(timer); +} +EXPORT_SYMBOL(init_timer); + +void init_timer_deferrable(struct timer_list *timer) +{ + init_timer(timer); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable); + +static inline void detach_timer(struct timer_list *timer, + int clear_pending) +{ + struct list_head *entry = &timer->entry; + + debug_timer_deactivate(timer); + + __list_del(entry->prev, entry->next); + if (clear_pending) + entry->next = NULL; + entry->prev = LIST_POISON2; +} + +/* + * We are using hashed locking: holding per_cpu(tvec_bases).lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on ->tvX lists. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. + */ +static struct tvec_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) +{ + struct tvec_base *base; + + for (;;) { + struct tvec_base *prelock_base = timer->base; + base = tbase_get_base(prelock_base); + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(prelock_base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +int __mod_timer(struct timer_list *timer, unsigned long expires) +{ + struct tvec_base *base, *new_base; + unsigned long flags; + int ret = 0; + + timer_stats_timer_set_start_info(timer); + BUG_ON(!timer->function); + + base = lock_timer_base(timer, &flags); + + if (timer_pending(timer)) { + detach_timer(timer, 0); + ret = 1; + } + + debug_timer_activate(timer); + + new_base = __get_cpu_var(tvec_bases); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that + * the timer is serialized wrt itself. + */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + timer_set_base(timer, base); + } + } + + timer->expires = expires; + internal_add_timer(base, timer); + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +EXPORT_SYMBOL(__mod_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_timer_activate(timer); + internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); +} + +/** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer() is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + BUG_ON(!timer->function); + + timer_stats_timer_set_start_info(timer); + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires); +} + +EXPORT_SYMBOL(mod_timer); + +/** + * del_timer - deactive a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + */ +int del_timer(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = 0; + + timer_stats_timer_clear_start_info(timer); + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; + } + spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +} + +EXPORT_SYMBOL(del_timer); + +#ifdef CONFIG_SMP +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU. + * + * It must not be called from interrupt contexts. + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; + } +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +EXPORT_SYMBOL(try_to_del_timer_sync); + +/** + * del_timer_sync - deactivate a timer and wait for the handler to finish. + * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts. The caller must not hold locks which would prevent + * completion of the timer's handler. The timer's handler must not call + * add_timer_on(). Upon exit the timer is not queued and the handler is + * not running on any CPU. + * + * The function returns whether it has deactivated a pending timer or not. + */ +int del_timer_sync(struct timer_list *timer) +{ + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; + cpu_relax(); + } +} + +EXPORT_SYMBOL(del_timer_sync); +#endif + +static int cascade(struct tvec_base *base, struct tvec *tv, int index) +{ + /* cascade all the timers from tv up one level */ + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); + + /* + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. + */ + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(tbase_get_base(timer->base) != base); + internal_add_timer(base, timer); + } + + return index; +} + +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + * + * This function cascades all vectors and executes all expired timer + * vectors. + */ +static inline void __run_timers(struct tvec_base *base) +{ + struct timer_list *timer; + + spin_lock_irq(&base->lock); + while (time_after_eq(jiffies, base->timer_jiffies)) { + struct list_head work_list; + struct list_head *head = &work_list; + int index = base->timer_jiffies & TVR_MASK; + + /* + * Cascade timers: + */ + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); + while (!list_empty(head)) { + void (*fn)(unsigned long); + unsigned long data; + + timer = list_first_entry(head, struct timer_list,entry); + fn = timer->function; + data = timer->data; + + timer_stats_account_timer(timer); + + set_running_timer(base, timer); + detach_timer(timer, 1); + spin_unlock_irq(&base->lock); + { + int preempt_count = preempt_count(); + fn(data); + if (preempt_count != preempt_count()) { + printk(KERN_ERR "huh, entered %p " + "with preempt_count %08x, exited" + " with %08x?\n", + fn, preempt_count, + preempt_count()); + BUG(); + } + } + spin_lock_irq(&base->lock); + } + } + set_running_timer(base, NULL); + spin_unlock_irq(&base->lock); +} + +#ifdef CONFIG_NO_HZ +/* + * Find out when the next timer event is due to happen. This + * is used on S/390 to stop all activity when a cpus is idle. + * This functions needs to be called disabled. + */ +static unsigned long __next_timer_interrupt(struct tvec_base *base) +{ + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + int index, slot, array, found = 0; + struct timer_list *nte; + struct tvec *varray[4]; + + /* Look for timer events in tv1. */ + index = slot = timer_jiffies & TVR_MASK; + do { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + expires = nte->expires; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; + } + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; + + /* Check tv2-tv5. */ + varray[0] = &base->tv2; + varray[1] = &base->tv3; + varray[2] = &base->tv4; + varray[3] = &base->tv5; + + for (array = 0; array < 4; array++) { + struct tvec *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; + do { + list_for_each_entry(nte, varp->vec + slot, entry) { + found = 1; + if (time_before(nte->expires, expires)) + expires = nte->expires; + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; + } + return expires; +} + +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + unsigned long delta; + + if (hr_delta.tv64 == KTIME_MAX) + return expires; + + /* + * Expired timer available, let it expire in the next tick + */ + if (hr_delta.tv64 <= 0) + return now + 1; + + tsdelta = ktime_to_timespec(hr_delta); + delta = timespec_to_jiffies(&tsdelta); + + /* + * Limit the delta to the max value, which is checked in + * tick_nohz_stop_sched_tick(): + */ + if (delta > NEXT_TIMER_MAX_DELTA) + delta = NEXT_TIMER_MAX_DELTA; + + /* + * Take rounding errors in to account and make sure, that it + * expires in the next tick. Otherwise we go into an endless + * ping pong due to tick_nohz_stop_sched_tick() retriggering + * the timer softirq + */ + if (delta < 1) + delta = 1; + now += delta; + if (time_before(now, expires)) + return now; + return expires; +} + +/** + * get_next_timer_interrupt - return the jiffy of the next pending timer + * @now: current time (in jiffies) + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + struct tvec_base *base = __get_cpu_var(tvec_bases); + unsigned long expires; + + spin_lock(&base->lock); + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} +#endif + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_tick); + printk_tick(); + scheduler_tick(); + run_posix_cpu_timers(p); +} + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + return nr_active() * FIXED_1; +} + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seems to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + * + * Requires xtime_lock to access. + */ +unsigned long avenrun[3]; + +EXPORT_SYMBOL(avenrun); + +/* + * calc_load - given tick count, update the avenrun load estimates. + * This is called while holding a write_lock on xtime_lock. + */ +static inline void calc_load(unsigned long ticks) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + count -= ticks; + if (unlikely(count < 0)) { + active_tasks = count_active_tasks(); + do { + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + count += LOAD_FREQ; + } while (count < 0); + } +} + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + struct tvec_base *base = __get_cpu_var(tvec_bases); + + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + hrtimer_run_queues(); + raise_softirq(TIMER_SOFTIRQ); + softlockup_tick(); +} + +/* + * Called by the timer interrupt. xtime_lock must already be taken + * by the timer IRQ! + */ +static inline void update_times(unsigned long ticks) +{ + update_wall_time(); + calc_load(ticks); +} + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ + +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_times(ticks); +} + +#ifdef __ARCH_WANT_SYS_ALARM + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + +#ifndef __alpha__ + +/* + * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this + * should be moved into arch/i386 instead? + */ + +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +SYSCALL_DEFINE0(getpid) +{ + return task_tgid_vnr(current); +} + +/* + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). + */ +SYSCALL_DEFINE0(getppid) +{ + int pid; + + rcu_read_lock(); + pid = task_tgid_vnr(current->real_parent); + rcu_read_unlock(); + + return pid; +} + +SYSCALL_DEFINE0(getuid) +{ + /* Only we change this so SMP safe */ + return current_uid(); +} + +SYSCALL_DEFINE0(geteuid) +{ + /* Only we change this so SMP safe */ + return current_euid(); +} + +SYSCALL_DEFINE0(getgid) +{ + /* Only we change this so SMP safe */ + return current_gid(); +} + +SYSCALL_DEFINE0(getegid) +{ + /* Only we change this so SMP safe */ + return current_egid(); +} + +#endif + +static void process_timeout(unsigned long __data) +{ + wake_up_process((struct task_struct *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + dump_stack(); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire); + schedule(); + del_singleshot_timer_sync(&timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/* Thread ID - the internal kernel "pid" */ +SYSCALL_DEFINE0(gettid) +{ + return task_pid_vnr(current); +} + +/** + * do_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill + */ +int do_sysinfo(struct sysinfo *info) +{ + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + unsigned long seq; + + memset(info, 0, sizeof(struct sysinfo)); + + do { + struct timespec tp; + seq = read_seqbegin(&xtime_lock); + + /* + * This is annoying. The below is the same thing + * posix_get_clock_monotonic() does, but it wants to + * take the lock which we want to cover the loads stuff + * too. + */ + + getnstimeofday(&tp); + tp.tv_sec += wall_to_monotonic.tv_sec; + tp.tv_nsec += wall_to_monotonic.tv_nsec; + monotonic_to_bootbased(&tp); + if (tp.tv_nsec - NSEC_PER_SEC >= 0) { + tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; + tp.tv_sec++; + } + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + + info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + info->procs = nr_threads; + } while (read_seqretry(&xtime_lock, seq)); + + si_meminfo(info); + si_swapinfo(info); + + /* + * If the sum of all the available memory (i.e. ram + swap) + * is less than can be stored in a 32 bit unsigned long then + * we can be binary compatible with 2.2.x kernels. If not, + * well, in that case 2.2.x was broken anyways... + * + * -Erik Andersen <andersee@debian.org> + */ + + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) + goto out; + bitcount = 0; + mem_unit = info->mem_unit; + while (mem_unit > 1) { + bitcount++; + mem_unit >>= 1; + sav_total = mem_total; + mem_total <<= 1; + if (mem_total < sav_total) + goto out; + } + + /* + * If mem_total did not overflow, multiply all memory values by + * info->mem_unit and set it to 1. This leaves things compatible + * with 2.2.x, and also retains compatibility with earlier 2.4.x + * kernels... + */ + + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) +{ + struct sysinfo val; + + do_sysinfo(&val); + + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; + + return 0; +} + +static int __cpuinit init_timers_cpu(int cpu) +{ + int j; + struct tvec_base *base; + static char __cpuinitdata tvec_base_done[NR_CPUS]; + + if (!tvec_base_done[cpu]) { + static char boot_done; + + if (boot_done) { + /* + * The APs use this path later in boot + */ + base = kmalloc_node(sizeof(*base), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + if (!base) + return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } + per_cpu(tvec_bases, cpu) = base; + } else { + /* + * This is for the boot CPU - we use compile-time + * static initialisation because per-cpu memory isn't + * ready yet and because the memory allocators are not + * initialised either. + */ + boot_done = 1; + base = &boot_tvec_bases; + } + tvec_base_done[cpu] = 1; + } else { + base = per_cpu(tvec_bases, cpu); + } + + spin_lock_init(&base->lock); + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +{ + struct timer_list *timer; + + while (!list_empty(head)) { + timer = list_first_entry(head, struct timer_list, entry); + detach_timer(timer, 0); + timer_set_base(timer, new_base); + internal_add_timer(new_base, timer); + } +} + +static void __cpuinit migrate_timers(int cpu) +{ + struct tvec_base *old_base; + struct tvec_base *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = per_cpu(tvec_bases, cpu); + new_base = get_cpu_var(tvec_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < TVR_SIZE; i++) + migrate_timer_list(new_base, old_base->tv1.vec + i); + for (i = 0; i < TVN_SIZE; i++) { + migrate_timer_list(new_base, old_base->tv2.vec + i); + migrate_timer_list(new_base, old_base->tv3.vec + i); + migrate_timer_list(new_base, old_base->tv4.vec + i); + migrate_timer_list(new_base, old_base->tv5.vec + i); + } + + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_var(tvec_bases); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __cpuinit timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (init_timers_cpu(cpu) < 0) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + migrate_timers(cpu); + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata timers_nb = { + .notifier_call = timer_cpu_notify, +}; + + +void __init init_timers(void) +{ + int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + + init_timer_stats(); + + BUG_ON(err == NOTIFY_BAD); + register_cpu_notifier(&timers_nb); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); +} + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} + +EXPORT_SYMBOL(msleep); +#endif /* DDE */ + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); diff --git a/libdde-linux26/lib/src/kernel/wait.c b/libdde-linux26/lib/src/kernel/wait.c new file mode 100644 index 00000000..b10d867f --- /dev/null +++ b/libdde-linux26/lib/src/kernel/wait.c @@ -0,0 +1,301 @@ +/* + * Generic waiting primitives. + * + * (C) 2004 William Irwin, Oracle + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/wait.h> +#include <linux/hash.h> + +#ifdef DDE_LINUX +#include "local.h" +#endif + +void init_waitqueue_head(wait_queue_head_t *q) +{ + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(init_waitqueue_head); + +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the critical region). + */ +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +/* + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_wait); + +/* + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @state: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and noone wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_common(q, mode, 1, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) + ret = (*action)(q->key.flags); + } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + do { + int ret; + + prepare_to_wait_exclusive(wq, &q->wait, mode); + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; + } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + finish_wait(wq, &q->wait); + return 0; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ +#ifndef DDE_LINUX + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &key); +#else + WARN_UNIMPL; +#endif +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ +#ifndef DDE_LINUX + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + const struct zone *zone = page_zone(virt_to_page(word)); + unsigned long val = (unsigned long)word << shift | bit; + + return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +#else + WARN_UNIMPL; + return NULL; +#endif +} +EXPORT_SYMBOL(bit_waitqueue); diff --git a/libdde-linux26/lib/src/kernel/workqueue.c b/libdde-linux26/lib/src/kernel/workqueue.c new file mode 100644 index 00000000..5ad26d9f --- /dev/null +++ b/libdde-linux26/lib/src/kernel/workqueue.c @@ -0,0 +1,1038 @@ +/* + * linux/kernel/workqueue.c + * + * Generic mechanism for defining kernel helper threads for running + * arbitrary tasks in process context. + * + * Started by Ingo Molnar, Copyright (C) 2002 + * + * Derived from the taskqueue/keventd code by: + * + * David Woodhouse <dwmw2@infradead.org> + * Andrew Morton + * Kai Petzke <wpp@marie.physik.tu-berlin.de> + * Theodore Ts'o <tytso@mit.edu> + * + * Made to use alloc_percpu by Christoph Lameter. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/signal.h> +#include <linux/completion.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/hardirq.h> +#include <linux/mempolicy.h> +#include <linux/freezer.h> +#include <linux/kallsyms.h> +#include <linux/debug_locks.h> +#include <linux/lockdep.h> + +#ifdef DDE_LINUX +#include "local.h" +#endif + +/* + * The per-CPU workqueue (if single thread, we always use the first + * possible cpu). + */ +struct cpu_workqueue_struct { + + spinlock_t lock; + + struct list_head worklist; + wait_queue_head_t more_work; + struct work_struct *current_work; + + struct workqueue_struct *wq; + struct task_struct *thread; + + int run_depth; /* Detect run_workqueue() recursion depth */ +} ____cacheline_aligned; + +/* + * The externally visible workqueue abstraction is an array of + * per-CPU workqueues: + */ +struct workqueue_struct { + struct cpu_workqueue_struct *cpu_wq; + struct list_head list; + const char *name; + int singlethread; + int freezeable; /* Freeze threads during suspend */ + int rt; +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +/* Serializes the accesses to the list of workqueues. */ +static DEFINE_SPINLOCK(workqueue_lock); +static LIST_HEAD(workqueues); + +static int singlethread_cpu __read_mostly; +static const struct cpumask *cpu_singlethread_map __read_mostly; +/* + * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD + * flushes cwq->worklist. This means that flush_workqueue/wait_on_work + * which comes in between can't use for_each_online_cpu(). We could + * use cpu_possible_map, the cpumask below is more a documentation + * than optimization. + */ +static cpumask_var_t cpu_populated_map __read_mostly; + +/* If it's single threaded, it isn't in the list of workqueues. */ +static inline int is_wq_single_threaded(struct workqueue_struct *wq) +{ + return wq->singlethread; +} + +static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) +{ + return is_wq_single_threaded(wq) + ? cpu_singlethread_map : cpu_populated_map; +} + +static +struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +{ + if (unlikely(is_wq_single_threaded(wq))) + cpu = singlethread_cpu; + return per_cpu_ptr(wq->cpu_wq, cpu); +} + +/* + * Set the workqueue on which a work item is to be run + * - Must *only* be called if the pending flag is set + */ +static inline void set_wq_data(struct work_struct *work, + struct cpu_workqueue_struct *cwq) +{ + unsigned long new; + + BUG_ON(!work_pending(work)); + + new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); + new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); + atomic_long_set(&work->data, new); +} + +static inline +struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +{ + return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); +} + +static void insert_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work, struct list_head *head) +{ + set_wq_data(work, cwq); + /* + * Ensure that we get the right work->data if we see the + * result of list_add() below, see try_to_grab_pending(). + */ + smp_wmb(); + list_add_tail(&work->entry, head); + wake_up(&cwq->more_work); +} + +static void __queue_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + unsigned long flags; + + spin_lock_irqsave(&cwq->lock, flags); + insert_work(cwq, work, &cwq->worklist); + spin_unlock_irqrestore(&cwq->lock, flags); +} + +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. + */ +int queue_work(struct workqueue_struct *wq, struct work_struct *work) +{ + int ret; + + ret = queue_work_on(get_cpu(), wq, work); + put_cpu(); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_work); + +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. + */ +int +queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +{ + int ret = 0; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + BUG_ON(!list_empty(&work->entry)); + __queue_work(wq_per_cpu(wq, cpu), work); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_on); + +static void delayed_work_timer_fn(unsigned long __data) +{ + struct delayed_work *dwork = (struct delayed_work *)__data; + struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); + struct workqueue_struct *wq = cwq->wq; + + __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); +} + +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + */ +int queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + if (delay == 0) + return queue_work(wq, &dwork->work); + + return queue_delayed_work_on(-1, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(queue_delayed_work); + +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + */ +int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + timer_stats_timer_set_start_info(&dwork->timer); + + /* This stores cwq for the moment, for the timer_fn */ + set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + timer->expires = jiffies + delay; + timer->data = (unsigned long)dwork; + timer->function = delayed_work_timer_fn; + + if (unlikely(cpu >= 0)) + add_timer_on(timer, cpu); + else + add_timer(timer); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_delayed_work_on); + +static void run_workqueue(struct cpu_workqueue_struct *cwq) +{ + spin_lock_irq(&cwq->lock); + cwq->run_depth++; + if (cwq->run_depth > 3) { + /* morton gets to eat his hat */ + printk("%s: recursion depth exceeded: %d\n", + __func__, cwq->run_depth); + dump_stack(); + } + while (!list_empty(&cwq->worklist)) { + struct work_struct *work = list_entry(cwq->worklist.next, + struct work_struct, entry); + work_func_t f = work->func; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct + * from inside the function that is called from it, + * this we need to take into account for lockdep too. + * To avoid bogus "held lock freed" warnings as well + * as problems when looking into work->lockdep_map, + * make a copy and use that here. + */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + + cwq->current_work = work; + list_del_init(cwq->worklist.next); + spin_unlock_irq(&cwq->lock); + + BUG_ON(get_wq_data(work) != cwq); + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + f(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), + task_pid_nr(current)); +#ifndef DDE_LINUX + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); +#endif /* DDE_LINUX */ + } + + spin_lock_irq(&cwq->lock); + cwq->current_work = NULL; + } + cwq->run_depth--; + spin_unlock_irq(&cwq->lock); +} + +static int worker_thread(void *__cwq) +{ + struct cpu_workqueue_struct *cwq = __cwq; + DEFINE_WAIT(wait); + + if (cwq->wq->freezeable) + set_freezable(); + + set_user_nice(current, -5); + + for (;;) { + prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!freezing(current) && + !kthread_should_stop() && + list_empty(&cwq->worklist)) + schedule(); + finish_wait(&cwq->more_work, &wait); + + try_to_freeze(); + + if (kthread_should_stop()) + break; + + run_workqueue(cwq); + } + + return 0; +} + +struct wq_barrier { + struct work_struct work; + struct completion done; +}; + +static void wq_barrier_func(struct work_struct *work) +{ + struct wq_barrier *barr = container_of(work, struct wq_barrier, work); + complete(&barr->done); +} + +static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, + struct wq_barrier *barr, struct list_head *head) +{ + INIT_WORK(&barr->work, wq_barrier_func); + __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + + init_completion(&barr->done); + + insert_work(cwq, &barr->work, head); +} + +static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +{ + int active; + + if (cwq->thread == current) { + /* + * Probably keventd trying to flush its own queue. So simply run + * it by hand rather than deadlocking. + */ + run_workqueue(cwq); + active = 1; + } else { + struct wq_barrier barr; + + active = 0; + spin_lock_irq(&cwq->lock); + if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { + insert_wq_barrier(cwq, &barr, &cwq->worklist); + active = 1; + } + spin_unlock_irq(&cwq->lock); + + if (active) + wait_for_completion(&barr.done); + } + + return active; +} + +/** + * flush_workqueue - ensure that any scheduled work has run to completion. + * @wq: workqueue to flush + * + * Forces execution of the workqueue and blocks until its completion. + * This is typically used in driver shutdown handlers. + * + * We sleep until all works which were queued on entry have been handled, + * but we are not livelocked by new incoming ones. + * + * This function used to run the workqueues itself. Now we just wait for the + * helper threads to do it. + */ +void flush_workqueue(struct workqueue_struct *wq) +{ + const struct cpumask *cpu_map = wq_cpu_map(wq); + int cpu; + + might_sleep(); + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); + for_each_cpu_mask_nr(cpu, *cpu_map) + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); +} +EXPORT_SYMBOL_GPL(flush_workqueue); + +/** + * flush_work - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * Returns false if @work has already terminated. + * + * It is expected that, prior to calling flush_work(), the caller has + * arranged for the work to not be requeued, otherwise it doesn't make + * sense to use this function. + */ +int flush_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + struct list_head *prev; + struct wq_barrier barr; + + might_sleep(); + cwq = get_wq_data(work); + if (!cwq) + return 0; + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + prev = NULL; + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * See the comment near try_to_grab_pending()->smp_rmb(). + * If it was re-queued under us we are not going to wait. + */ + smp_rmb(); + if (unlikely(cwq != get_wq_data(work))) + goto out; + prev = &work->entry; + } else { + if (cwq->current_work != work) + goto out; + prev = &cwq->worklist; + } + insert_wq_barrier(cwq, &barr, prev->next); +out: + spin_unlock_irq(&cwq->lock); + if (!prev) + return 0; + + wait_for_completion(&barr.done); + return 1; +} +EXPORT_SYMBOL_GPL(flush_work); + +/* + * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, + * so this work can't be re-armed in any way. + */ +static int try_to_grab_pending(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + int ret = -1; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + return 0; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + + cwq = get_wq_data(work); + if (!cwq) + return ret; + + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong cwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (cwq == get_wq_data(work)) { + list_del_init(&work->entry); + ret = 1; + } + } + spin_unlock_irq(&cwq->lock); + + return ret; +} + +static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + struct wq_barrier barr; + int running = 0; + + spin_lock_irq(&cwq->lock); + if (unlikely(cwq->current_work == work)) { + insert_wq_barrier(cwq, &barr, cwq->worklist.next); + running = 1; + } + spin_unlock_irq(&cwq->lock); + + if (unlikely(running)) + wait_for_completion(&barr.done); +} + +static void wait_on_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + const struct cpumask *cpu_map; + int cpu; + + might_sleep(); + + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + + cwq = get_wq_data(work); + if (!cwq) + return; + + wq = cwq->wq; + cpu_map = wq_cpu_map(wq); + + for_each_cpu_mask_nr(cpu, *cpu_map) + wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); +} + +static int __cancel_work_timer(struct work_struct *work, + struct timer_list* timer) +{ + int ret; + + do { + ret = (timer && likely(del_timer(timer))); + if (!ret) + ret = try_to_grab_pending(work); + wait_on_work(work); + } while (unlikely(ret < 0)); + + work_clear_pending(work); + return ret; +} + +/** + * cancel_work_sync - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * Returns true if @work was pending. + * + * cancel_work_sync() will cancel the work if it is queued. If the work's + * callback appears to be running, cancel_work_sync() will block until it + * has completed. + * + * It is possible to use this function if the work re-queues itself. It can + * cancel the work even if it migrates to another workqueue, however in that + * case it only guarantees that work->func() has completed on the last queued + * workqueue. + * + * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not + * pending, otherwise it goes into a busy-wait loop until the timer expires. + * + * The caller must ensure that workqueue_struct on which this work was last + * queued can't be destroyed before this function returns. + */ +int cancel_work_sync(struct work_struct *work) +{ + return __cancel_work_timer(work, NULL); +} +EXPORT_SYMBOL_GPL(cancel_work_sync); + +/** + * cancel_delayed_work_sync - reliably kill off a delayed work. + * @dwork: the delayed work struct + * + * Returns true if @dwork was pending. + * + * It is possible to use this function if @dwork rearms itself via queue_work() + * or queue_delayed_work(). See also the comment for cancel_work_sync(). + */ +int cancel_delayed_work_sync(struct delayed_work *dwork) +{ + return __cancel_work_timer(&dwork->work, &dwork->timer); +} +EXPORT_SYMBOL(cancel_delayed_work_sync); + +static struct workqueue_struct *keventd_wq __read_mostly; + +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * This puts a job in the kernel-global workqueue. + */ +int schedule_work(struct work_struct *work) +{ + return queue_work(keventd_wq, work); +} +EXPORT_SYMBOL(schedule_work); + +/* + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +int schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, keventd_wq, work); +} +EXPORT_SYMBOL(schedule_work_on); + +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +int schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work(keventd_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work); + +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @dwork: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ +int schedule_delayed_work_on(int cpu, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work_on); + +/** + * schedule_on_each_cpu - call a function on each online CPU from keventd + * @func: the function to call + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * schedule_on_each_cpu() is very slow. + */ +int schedule_on_each_cpu(work_func_t func) +{ + int cpu; + struct work_struct *works; + + works = alloc_percpu(struct work_struct); + if (!works) + return -ENOMEM; + + get_online_cpus(); + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, func); + schedule_work_on(cpu, work); + } + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); + free_percpu(works); + return 0; +} + +void flush_scheduled_work(void) +{ + flush_workqueue(keventd_wq); +} +EXPORT_SYMBOL(flush_scheduled_work); + +/** + * execute_in_process_context - reliably execute the routine with user context + * @fn: the function to execute + * @ew: guaranteed storage for the execute work structure (must + * be available when the work executes) + * + * Executes the function immediately if process context is available, + * otherwise schedules the function for delayed execution. + * + * Returns: 0 - function was executed + * 1 - function was scheduled for execution + */ +int execute_in_process_context(work_func_t fn, struct execute_work *ew) +{ + if (!in_interrupt()) { + fn(&ew->work); + return 0; + } + + INIT_WORK(&ew->work, fn); + schedule_work(&ew->work); + + return 1; +} +EXPORT_SYMBOL_GPL(execute_in_process_context); + +int keventd_up(void) +{ + return keventd_wq != NULL; +} + +int current_is_keventd(void) +{ + struct cpu_workqueue_struct *cwq; + int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ + int ret = 0; + + BUG_ON(!keventd_wq); + + cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); + if (current == cwq->thread) + ret = 1; + + return ret; + +} + +static struct cpu_workqueue_struct * +init_cpu_workqueue(struct workqueue_struct *wq, int cpu) +{ + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); + + cwq->wq = wq; + spin_lock_init(&cwq->lock); + INIT_LIST_HEAD(&cwq->worklist); + init_waitqueue_head(&cwq->more_work); + + return cwq; +} + +static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct workqueue_struct *wq = cwq->wq; + const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; + struct task_struct *p; + + p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); + /* + * Nobody can add the work_struct to this cwq, + * if (caller is __create_workqueue) + * nobody should see this wq + * else // caller is CPU_UP_PREPARE + * cpu is not on cpu_online_map + * so we can abort safely. + */ + if (IS_ERR(p)) + return PTR_ERR(p); + if (cwq->wq->rt) + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); + cwq->thread = p; + + return 0; +} + +static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct task_struct *p = cwq->thread; + + if (p != NULL) { + if (cpu >= 0) + kthread_bind(p, cpu); + wake_up_process(p); + } +} + +struct workqueue_struct *__create_workqueue_key(const char *name, + int singlethread, + int freezeable, + int rt, + struct lock_class_key *key, + const char *lock_name) +{ + struct workqueue_struct *wq; + struct cpu_workqueue_struct *cwq; + int err = 0, cpu; + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return NULL; + + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } + + wq->name = name; + lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); + wq->singlethread = singlethread; + wq->freezeable = freezeable; + wq->rt = rt; + INIT_LIST_HEAD(&wq->list); + + if (singlethread) { + cwq = init_cpu_workqueue(wq, singlethread_cpu); + err = create_workqueue_thread(cwq, singlethread_cpu); + start_workqueue_thread(cwq, -1); + } else { + cpu_maps_update_begin(); + /* + * We must place this wq on list even if the code below fails. + * cpu_down(cpu) can remove cpu from cpu_populated_map before + * destroy_workqueue() takes the lock, in that case we leak + * cwq[cpu]->thread. + */ + spin_lock(&workqueue_lock); + list_add(&wq->list, &workqueues); + spin_unlock(&workqueue_lock); + /* + * We must initialize cwqs for each possible cpu even if we + * are going to call destroy_workqueue() finally. Otherwise + * cpu_up() can hit the uninitialized cwq once we drop the + * lock. + */ + for_each_possible_cpu(cpu) { + cwq = init_cpu_workqueue(wq, cpu); + if (err || !cpu_online(cpu)) + continue; + err = create_workqueue_thread(cwq, cpu); + start_workqueue_thread(cwq, cpu); + } + cpu_maps_update_done(); + } + + if (err) { + destroy_workqueue(wq); + wq = NULL; + } + return wq; +} +EXPORT_SYMBOL_GPL(__create_workqueue_key); + +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) +{ + /* + * Our caller is either destroy_workqueue() or CPU_POST_DEAD, + * cpu_add_remove_lock protects cwq->thread. + */ + if (cwq->thread == NULL) + return; + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + flush_cpu_workqueue(cwq); + /* + * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, + * a concurrent flush_workqueue() can insert a barrier after us. + * However, in that case run_workqueue() won't return and check + * kthread_should_stop() until it flushes all work_struct's. + * When ->worklist becomes empty it is safe to exit because no + * more work_structs can be queued on this cwq: flush_workqueue + * checks list_empty(), and a "normal" queue_work() can't use + * a dead CPU. + */ + kthread_stop(cwq->thread); + cwq->thread = NULL; +} + +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ +void destroy_workqueue(struct workqueue_struct *wq) +{ + const struct cpumask *cpu_map = wq_cpu_map(wq); + int cpu; + + cpu_maps_update_begin(); + spin_lock(&workqueue_lock); + list_del(&wq->list); + spin_unlock(&workqueue_lock); + + for_each_cpu_mask_nr(cpu, *cpu_map) + cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); + cpu_maps_update_done(); + + free_percpu(wq->cpu_wq); + kfree(wq); +} +EXPORT_SYMBOL_GPL(destroy_workqueue); + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + int ret = NOTIFY_OK; + + action &= ~CPU_TASKS_FROZEN; + + switch (action) { + case CPU_UP_PREPARE: + cpumask_set_cpu(cpu, cpu_populated_map); + } +undo: + list_for_each_entry(wq, &workqueues, list) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + + switch (action) { + case CPU_UP_PREPARE: + if (!create_workqueue_thread(cwq, cpu)) + break; + printk(KERN_ERR "workqueue [%s] for %i failed\n", + wq->name, cpu); + action = CPU_UP_CANCELED; + ret = NOTIFY_BAD; + goto undo; + + case CPU_ONLINE: + start_workqueue_thread(cwq, cpu); + break; + + case CPU_UP_CANCELED: + start_workqueue_thread(cwq, -1); + case CPU_POST_DEAD: + cleanup_workqueue_thread(cwq); + break; + } + } + + switch (action) { + case CPU_UP_CANCELED: + case CPU_POST_DEAD: + cpumask_clear_cpu(cpu, cpu_populated_map); + } + + return ret; +} + +#ifdef CONFIG_SMP +static struct workqueue_struct *work_on_cpu_wq __read_mostly; + +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void do_work_for_cpu(struct work_struct *w) +{ + struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * This will return the value @fn returns. + * It is up to the caller to ensure that the cpu doesn't go offline. + */ +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc; + + INIT_WORK(&wfc.work, do_work_for_cpu); + wfc.fn = fn; + wfc.arg = arg; + queue_work_on(cpu, work_on_cpu_wq, &wfc.work); + flush_work(&wfc.work); + + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + +void __init init_workqueues(void) +{ + alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + + cpumask_copy(cpu_populated_map, cpu_online_mask); + singlethread_cpu = cpumask_first(cpu_possible_mask); + cpu_singlethread_map = cpumask_of(singlethread_cpu); + hotcpu_notifier(workqueue_cpu_callback, 0); + keventd_wq = create_workqueue("events"); + BUG_ON(!keventd_wq); +#ifdef CONFIG_SMP + work_on_cpu_wq = create_workqueue("work_on_cpu"); + BUG_ON(!work_on_cpu_wq); +#endif +} + +#ifdef DDE_LINUX +core_initcall(init_workqueues); +#endif |