Diffstat (limited to 'libdde-linux26/lib/src')
69 files changed, 59279 insertions, 0 deletions
diff --git a/libdde-linux26/lib/src/Makefile b/libdde-linux26/lib/src/Makefile new file mode 100644 index 00000000..4f1ec099 --- /dev/null +++ b/libdde-linux26/lib/src/Makefile @@ -0,0 +1,243 @@ +PKGDIR ?= ../.. +L4DIR ?= $(PKGDIR) +CONTRIB ?= $(PKGDIR)/contrib + +include $(PKGDIR)/Makeconf.local +ifeq ($(origin SRC_DIR),undefined) +SRC_DIR := $(shell pwd) +endif + +ifeq ($(CONFIG_DDE26_COMMON),y) +TARGET += libdde_linux26.o.a +endif + +ifeq ($(CONFIG_DDE26_NET),y) +TARGET += libdde_linux26_net.a +endif + +ifeq ($(CONFIG_DDE26_BLOCK),y) +TARGET += libdde_linux26_block.a +endif + +ifeq ($(CONFIG_DDE26_SOUND),y) +TARGET += libdde_linux26_sound.a +endif + +ifeq ($(CONFIG_DDE26_CHAR),y) +TARGET += libdde_linux26_char.a +endif + +ifeq ($(ARCH), x86) +ARCH_DIR = arch/x86 +endif + +ifeq ($(ARCH), arm) +ARCH_DIR = arch/arm +MARCH = realview +DEFINES += -D__LINUX_ARM_ARCH__=6 +endif + +# contrib sources are in $(CONTRIB) +vpath %.c $(CONTRIB) +vpath %.S $(CONTRIB) + +PRIVATE_INCDIR += $(CONTRIB)/drivers/pci $(PKGDIR)/lib/src/arch/l4 \ + $(CONTRIB)/$(ARCH_DIR)/pci $(CONTRIB)/drivers/base/ \ + $(CONTRIB)/lib $(PKGDIR_OBJ) $(CONTRIB)/net/core $(CONTRIB)/fs + +################################################################## +# Sources for libdde_linux.a # +################################################################## +SRC_DDE = cli_sti.c fs.c hw-helpers.c init_task.c init.c pci.c power.c \ + process.c res.c sched.c signal.c smp.c softirq.c timer.c \ + page_alloc.c kmem_cache.c kmalloc.c irq.c param.c \ + vmalloc.c vmstat.c mm-helper.c + +# our implementation +SRC_C_libdde_linux26.o.a = $(addprefix arch/l4/, $(SRC_DDE)) + +ifeq ($(ARCH), x86) +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/semaphore_32.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/delay.o +SRC_C_libdde_linux26.o.a += lib/rwsem.c +SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/pci-dma.c +SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/pci-nommu.c +SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/setup.c +SRC_S_libdde_linux26_net.a += $(ARCH_DIR)/lib/checksum_32.S +endif + +ifeq ($(ARCH), arm) +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/changebit.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/clearbit.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/div64.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/findbit.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/memzero.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/setbit.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/testclearbit.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/testchangebit.S +SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/testsetbit.S +SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/semaphore.c +SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/traps.c +SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/mach-$(MARCH)/clock.c +SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/mach-$(MARCH)/realview_eb.c +SRC_C_libdde_linux26.o.a += lib/rwsem-spinlock.c +SRC_C_libdde_linux26.o.a += drivers/amba/bus.c +endif + +# + contrib stuff / slightly modified stuff +SRC_C_libdde_linux26.o.a += \ + kernel/dma.c \ + kernel/exit.c \ + kernel/kthread.c \ + kernel/mutex.c \ + kernel/notifier.c \ + kernel/resource.c \ + kernel/rwsem.c \ + kernel/sched.c \ + kernel/semaphore.c \ + kernel/sys.c \ + kernel/time.c \ + kernel/timer.c \ + kernel/wait.c \ + kernel/workqueue.c \ + kernel/irq/handle.c \ + lib/bitmap.c \ + lib/bitrev.c \ + lib/crc32.c \ + lib/ctype.c \ + lib/cpumask.c \ + lib/find_next_bit.c \ + lib/hexdump.c \ + lib/idr.c \ + lib/iomap.c \ + lib/hweight.c \ + lib/kasprintf.c \ + lib/kernel_lock.c \ + lib/klist.c \ 
+ lib/kobject.c \ + lib/kref.c \ + lib/parser.c \ + lib/proportions.c \ + lib/radix-tree.c \ + lib/scatterlist.c \ + lib/sha1.c \ + lib/string.c \ + lib/vsprintf.c \ + lib/rbtree.c \ + init/calibrate.c \ + mm/dmapool.c \ + mm/mempool.c \ + mm/swap.c \ + mm/util.c \ + drivers/base/attribute_container.c \ + drivers/base/bus.c \ + drivers/base/class.c \ + drivers/base/core.c \ + drivers/base/cpu.c \ + drivers/base/dd.c \ + drivers/base/devres.c \ + drivers/base/driver.c \ + drivers/base/init.c \ + drivers/base/map.c \ + drivers/base/platform.c \ + drivers/base/sys.c \ + drivers/pci/access.c \ + drivers/pci/bus.c \ + drivers/pci/hotplug-pci.c \ + drivers/pci/pci.c \ + drivers/pci/pci-driver.c \ + drivers/pci/probe.c \ + drivers/pci/search.c \ + drivers/pci/setup-bus.c \ + drivers/pci/setup-res.c \ + drivers/char/random.c + +################################################################## +# Sources for libdde_linux_net.a # +################################################################## +SRC_C_libdde_linux26_net.a += \ + arch/l4/net.c \ + mach_glue/net.c \ + drivers/net/mii.c \ + net/core/dev.c \ + net/core/dev_mcast.c \ + net/core/ethtool.c \ + net/core/link_watch.c \ + net/core/neighbour.c \ + net/core/netevent.c \ + net/core/net-sysfs.c \ + net/core/net_namespace.c \ + net/core/rtnetlink.c \ + net/core/skbuff.c \ + net/core/skb_dma_map.c \ + net/core/utils.c \ + net/ethernet/eth.c \ + net/sched/sch_generic.c + +################################################################## +# Sources for libdde_linux_sound.a # +################################################################## +SRC_C_libdde_linux26_sound.a += \ + sound/sound_core.c \ + arch/l4/sound.c + +################################################################## +# Sources for libdde_linux_block.a # +################################################################## +# +SRC_C_libdde_linux26_block.a += \ + arch/l4/inodes.c \ + mach_glue/block.c \ + block/blk-barrier.c \ + block/blk-core.c \ + block/blk-exec.c \ + block/blk-ioc.c \ + block/blk-merge.c \ + block/blk-settings.c \ + block/blk-softirq.c \ + block/blk-sysfs.c \ + block/blk-tag.c \ + block/blk-timeout.c \ + block/elevator.c \ + block/genhd.c \ + block/noop-iosched.c \ + block/ioctl.c \ + block/scsi_ioctl.c \ + block/as-iosched.c \ + block/cfq-iosched.c \ + block/deadline-iosched.c \ + mm/backing-dev.c \ + mm/bounce.c \ + mm/page-writeback.c \ + fs/bio.c \ + fs/block_dev.c \ + fs/buffer.c \ + fs/filesystems.c +################################################################## +# Sources for libdde_linux_char.a # +################################################################## +SRC_C_libdde_linux26_char.a += \ + arch/l4/inodes.c \ + fs/char_dev.c + +all:: +lib/crc32.o : crc32table.h +lib/crc32.o : PRIVATE_INCDIR += . +kernel/time.o : timeconst.h +kernel/time.o : PRIVATE_INCDIR += . 
+ +timeconst.h : $(SRC_DIR)/kernel/timeconst.pl + @$(GEN_MESSAGE) + $(VERBOSE)$< 250 >$@ + +crc32table.h : gen_crc32table + @$(GEN_MESSAGE) + $(VERBOSE)./$< >$@ + +gen_crc32table : lib/gen_crc32table.c + @$(GEN_MESSAGE) + $(VERBOSE)$(HOST_CC) -O2 -o $@ $< + +include $(PKGDIR)/Makeconf + +include $(L4DIR)/mk/lib.mk diff --git a/libdde-linux26/lib/src/arch/l4/cli_sti.c b/libdde-linux26/lib/src/arch/l4/cli_sti.c new file mode 100644 index 00000000..81864ebb --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/cli_sti.c @@ -0,0 +1,63 @@ +#include "local.h" + +#include <linux/kernel.h> + +/* IRQ lock reference counter */ +static __thread atomic_t _refcnt = ATOMIC_INIT(0); + +/* Check whether IRQs are currently disabled. + * + * This is the case, if flags is greater than 0. + */ + +int raw_irqs_disabled_flags(unsigned long flags) +{ + return ((int)flags > 0); +} + +/* Store the current flags state. + * + * This is done by returning the current refcnt. + */ +unsigned long __raw_local_save_flags(void) +{ + return (unsigned long)atomic_read(&_refcnt); +} + +/* Restore IRQ state. */ +void raw_local_irq_restore(unsigned long flags) +{ + atomic_set(&_refcnt, flags); +} + +/* Disable IRQs by grabbing the IRQ lock. */ +void raw_local_irq_disable(void) +{ + atomic_inc(&_refcnt); +} + +/* Unlock the IRQ lock until refcnt is 0. */ +void raw_local_irq_enable(void) +{ + atomic_set(&_refcnt, 0); +} + + +void raw_safe_halt(void) +{ + WARN_UNIMPL; +} + + +void halt(void) +{ + WARN_UNIMPL; +} + +/* These functions are empty for DDE. Every DDE thread is a separate + * "virtual" CPU. Therefore there is no need to en/disable bottom halves. + */ +void local_bh_disable(void) {} +void __local_bh_enable(void) {} +void _local_bh_enable(void) {} +void local_bh_enable(void) {} diff --git a/libdde-linux26/lib/src/arch/l4/fs.c b/libdde-linux26/lib/src/arch/l4/fs.c new file mode 100644 index 00000000..db452949 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/fs.c @@ -0,0 +1,111 @@ +#include "local.h" + +#include <linux/fs.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> + +/* + * Some subsystems, such as the blockdev layer, implement their data + * hierarchy as a pseudo file system. To not incorporate the complete + * Linux VFS implementation, we cut this down to an own one only for + * pseudo file systems. + */ +static LIST_HEAD(dde_vfs_mounts); + +#define MAX_RA_PAGES 1 + +void default_unplug_io_fn(struct backing_dev_info *bdi, struct page* p) +{ +} + +struct backing_dev_info default_backing_dev_info = { + .ra_pages = MAX_RA_PAGES, + .state = 0, + .capabilities = BDI_CAP_MAP_COPY, + .unplug_io_fn = default_unplug_io_fn, +}; + +int seq_printf(struct seq_file *m, const char *f, ...) 
+{ + WARN_UNIMPL; + return 0; +} + +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + WARN_UNIMPL; + return 0; +} + + +/************************************** + * Filemap stuff * + **************************************/ +struct page * find_get_page(struct address_space *mapping, unsigned long offset) +{ + WARN_UNIMPL; + return NULL; +} + +void unlock_page(struct page *page) +{ + WARN_UNIMPL; +} + +int test_set_page_writeback(struct page *page) +{ + WARN_UNIMPL; + return 0; +} + +void end_page_writeback(struct page *page) +{ + WARN_UNIMPL; +} + +void do_invalidatepage(struct page *page, unsigned long offset) +{ + WARN_UNIMPL; +} + +int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +{ + WARN_UNIMPL; + return 0; +} + +static struct vfsmount *dde_kern_mount(struct file_system_type *type, + int flags, const char *name, + void *data) +{ + struct list_head *pos, *head; + int error; + + head = &dde_vfs_mounts; + __list_for_each(pos, head) { + struct vfsmount *mnt = list_entry(pos, struct vfsmount, next); + if (strcmp(name, mnt->name) == 0) { + printk("FS type %s already mounted!?\n", name); + BUG(); + return NULL; + } + } + + struct vfsmount *m = kzalloc(sizeof(*m), GFP_KERNEL); + m->fs_type = type; + m->name = kmalloc(strlen(name) + 1, GFP_KERNEL); + memcpy(m->name, name, strlen(name) + 1); + + error = type->get_sb(type, flags, name, data, m); + BUG_ON(error); + + list_add_tail(&m->next, &dde_vfs_mounts); + + return m; +} + +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +{ + return dde_kern_mount(type, 0, type->name, NULL); +} diff --git a/libdde-linux26/lib/src/arch/l4/hw-helpers.c b/libdde-linux26/lib/src/arch/l4/hw-helpers.c new file mode 100644 index 00000000..555406c9 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/hw-helpers.c @@ -0,0 +1,12 @@ +#include "local.h" + +#include <linux/kexec.h> + +note_buf_t *crash_notes = NULL; + +void touch_nmi_watchdog(void) +{ + WARN_UNIMPL; +} + +unsigned long pci_mem_start = 0xABCDABCD; diff --git a/libdde-linux26/lib/src/arch/l4/init.c b/libdde-linux26/lib/src/arch/l4/init.c new file mode 100644 index 00000000..79112f78 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/init.c @@ -0,0 +1,33 @@ +#include "local.h" + +#include <dde26.h> +#include <dde.h> + +#define DEBUG_PCI(msg, ...) ddekit_printf( "\033[33m"msg"\033[0m\n", ##__VA_ARGS__) + +/* Didn't know where to put this. 
*/ +unsigned long __per_cpu_offset[NR_CPUS]; + +extern void driver_init(void); +extern int classes_init(void); + +void __init __attribute__((used)) l4dde26_init(void) +{ + /* first, initialize DDEKit */ + ddekit_init(); + + l4dde26_kmalloc_init(); + + /* Init Linux driver framework before trying to add PCI devs to the bus */ + driver_init(); + + printk("Initialized DDELinux 2.6\n"); +} + +void l4dde26_do_initcalls(void) +{ + /* finally, let DDEKit perform all the initcalls */ + ddekit_do_initcalls(); +} + +dde_initcall(l4dde26_init); diff --git a/libdde-linux26/lib/src/arch/l4/init_task.c b/libdde-linux26/lib/src/arch/l4/init_task.c new file mode 100644 index 00000000..685373d1 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/init_task.c @@ -0,0 +1,131 @@ +#include "local.h" + +//#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/fdtable.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/ipc_namespace.h> +#include <linux/kernel.h> +#include <linux/mqueue.h> +#include <linux/module.h> +#include <linux/personality.h> + +/* init task */ +struct task_struct init_task; + +/* From kernel/pid.c */ +#define BITS_PER_PAGE (PAGE_SIZE*8) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) + +/* From init/main.c */ +enum system_states system_state; +EXPORT_SYMBOL(system_state); + +struct fs_struct init_fs = { + .count = ATOMIC_INIT(1), + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; + +struct files_struct init_files = { + .count = ATOMIC_INIT(1), + .fdt = &init_files.fdtab, + .fdtab = { + .max_fds = NR_OPEN_DEFAULT, + .fd = &init_files.fd_array[0], + .close_on_exec = (fd_set *)&init_files.close_on_exec_init, + .open_fds = (fd_set *)&init_files.open_fds_init, + .rcu = RCU_HEAD_INIT, + }, + .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), +}; + +struct signal_struct init_signals = INIT_SIGNALS(init_signals); +struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +struct mm_struct init_mm = INIT_MM(init_mm); +pgd_t swapper_pg_dir[PTRS_PER_PGD]; +union thread_union init_thread_union = { INIT_THREAD_INFO(init_task) }; +struct group_info init_groups = {.usage = ATOMIC_INIT(2)}; + +struct user_struct root_user = { + .__count = ATOMIC_INIT(1), + .processes = ATOMIC_INIT(1), + .files = ATOMIC_INIT(0), + .sigpending = ATOMIC_INIT(0), + .mq_bytes = 0, + .locked_shm = 0, +}; + +/* + * PID-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way a low pid_max + * value does not cause lots of bitmaps to be allocated, but + * the scheme scales to up to 4 million PIDs, runtime. + */ +struct pid_namespace init_pid_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .pidmap = { + [ 0 ... 
PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } + }, + .last_pid = 0, + .level = 0, + .child_reaper = &init_task, +}; +EXPORT_SYMBOL_GPL(init_pid_ns); + +struct net init_net __attribute__((weak)); + +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); + +struct ipc_namespace init_ipc_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, +}; + +struct user_namespace init_user_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, +}; + + +struct uts_namespace init_uts_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .name = { + .sysname = "L4/DDE", + .nodename = "", + .release = "2.6", + .version = "25", + .machine = "", + .domainname = "", + }, +}; + +struct exec_domain default_exec_domain = { + .name = "Linux", /* name */ + .handler = NULL, /* no signaling! */ + .pers_low = 0, /* PER_LINUX personality. */ + .pers_high = 0, /* PER_LINUX personality. */ + .signal_map = 0, /* Identity map signals. */ + .signal_invmap = 0, /* - both ways. */ +}; + +/* copy of the initial task struct */ +struct task_struct init_task = INIT_TASK(init_task); +/* copy of the initial thread info (which contains init_task) */ +struct thread_info init_thread = INIT_THREAD_INFO(init_task); + +long do_no_restart_syscall(struct restart_block *param) +{ + return -EINTR; +} diff --git a/libdde-linux26/lib/src/arch/l4/inodes.c b/libdde-linux26/lib/src/arch/l4/inodes.c new file mode 100644 index 00000000..9ef02ed5 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/inodes.c @@ -0,0 +1,311 @@ +/** lib/src/arch/l4/inodes.c + * + * Assorted dummies implementing inode and superblock access functions, + * which are used by the block layer stuff, but not needed in DDE_Linux. + */ + +#include "local.h" + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/mount.h> + +/* + * Linux' global list of all super blocks. + */ +LIST_HEAD(super_blocks); + +/********************************** + * Inode stuff * + **********************************/ + +struct inode* new_inode(struct super_block *sb) +{ + if (sb->s_op->alloc_inode) + return sb->s_op->alloc_inode(sb); + + return kzalloc(sizeof(struct inode), GFP_KERNEL); +} + +void __mark_inode_dirty(struct inode *inode, int flags) +{ + WARN_UNIMPL; +} + +void iput(struct inode *inode) +{ + WARN_UNIMPL; +} + +void generic_delete_inode(struct inode *inode) +{ + WARN_UNIMPL; +} + +int invalidate_inodes(struct super_block * sb) +{ + WARN_UNIMPL; + return 0; +} + +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + WARN_UNIMPL; +} + +void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +{ + WARN_UNIMPL; +} + +/********************************** + * Superblock stuff * + **********************************/ + +struct super_block * get_super(struct block_device *bdev) +{ + WARN_UNIMPL; + return NULL; +} + +int simple_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + WARN_UNIMPL; + return 0; +} + +void kill_anon_super(struct super_block *sb) +{ + WARN_UNIMPL; +} + +void shrink_dcache_sb(struct super_block * sb) +{ + WARN_UNIMPL; +} + +void drop_super(struct super_block *sb) +{ + WARN_UNIMPL; +} + +struct inode_operations empty_iops = { }; +struct file_operations empty_fops = { }; + +/**! Alloc and init a new inode. 
+ * + * Basically stolen from linux/fs/inode.c:alloc_inode() + */ +static struct inode *dde_alloc_inode(struct super_block *sb) +{ + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = kzalloc(sizeof(*inode), GFP_KERNEL); + + if (inode) { + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; + inode->dirtied_when = 0; + inode->i_private = NULL; + } + + return inode; +} + + +void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + + +static struct inode *dde_new_inode(struct super_block *sb, struct list_head *head, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct inode *ret = dde_alloc_inode(sb); + int err = 0; + + if (set) + err = set(ret, data); + + BUG_ON(err); + + __iget(ret); + ret->i_state = I_LOCK|I_NEW; + + list_add_tail(&ret->i_sb_list, &sb->s_inodes); + + return ret; +} + + +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct inode *inode = NULL; + struct list_head *p; + + list_for_each(p, &sb->s_inodes) { + struct inode *i = list_entry(p, struct inode, i_sb_list); + if (test) { + if (!test(i, data)) { + DEBUG_MSG("test false"); + continue; + } + else { + inode = i; + break; + } + } + } + + if (inode) + return inode; + + return dde_new_inode(sb, &sb->s_inodes, test, set, data); +} + +void unlock_new_inode(struct inode *inode) +{ + inode->i_state &= ~(I_LOCK | I_NEW); + wake_up_bit(&inode->i_state, __I_LOCK); +} + +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *, void*), + int (*set)(struct super_block *, void*), + void *data) +{ + struct super_block *s = NULL; + struct list_head *p; + int err; + + if (test) { + list_for_each(p, &type->fs_supers) { + struct super_block *block = list_entry(p, + struct super_block, + s_instances); + if (!test(block, data)) + continue; + return block; + } + } + + s = kzalloc(sizeof(*s), GFP_KERNEL); + BUG_ON(!s); + + INIT_LIST_HEAD(&s->s_dirty); + INIT_LIST_HEAD(&s->s_io); + INIT_LIST_HEAD(&s->s_files); + INIT_LIST_HEAD(&s->s_instances); + INIT_HLIST_HEAD(&s->s_anon); + INIT_LIST_HEAD(&s->s_inodes); + init_rwsem(&s->s_umount); + mutex_init(&s->s_lock); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * The locking rules for s_lock are up to the + * filesystem. 
For example ext3fs has different + * lock ordering than usbfs: + */ + lockdep_set_class(&s->s_lock, &type->s_lock_key); + down_write(&s->s_umount); + s->s_count = S_BIAS; + atomic_set(&s->s_active, 1); + mutex_init(&s->s_vfs_rename_mutex); + mutex_init(&s->s_dquot.dqio_mutex); + mutex_init(&s->s_dquot.dqonoff_mutex); + init_rwsem(&s->s_dquot.dqptr_sem); + init_waitqueue_head(&s->s_wait_unfrozen); + s->s_maxbytes = MAX_NON_LFS; +#if 0 + s->dq_op = sb_dquot_ops; + s->s_qcop = sb_quotactl_ops; + s->s_op = &default_op; +#endif + s->s_time_gran = 1000000000; + + err = set(s, data); + BUG_ON(err); + + s->s_type = type; + strlcpy(s->s_id, type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + list_add(&s->s_instances, &type->fs_supers); + __module_get(type->owner); + return s; +} + +int set_anon_super(struct super_block *s, void *data) +{ + WARN_UNIMPL; + return 0; +} + +int get_sb_pseudo(struct file_system_type *fs_type, char *name, + const struct super_operations *ops, unsigned long magic, + struct vfsmount *mnt) +{ + struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct super_operations default_ops = {}; + struct inode *root = NULL; + struct dentry *dentry = NULL; + struct qstr d_name = {.name = name, .len = strlen(name)}; + + BUG_ON(IS_ERR(s)); + + s->s_flags = MS_NOUSER; + s->s_maxbytes = ~0ULL; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = magic; + s->s_op = ops ? ops : &default_ops; + s->s_time_gran = 1; + root = new_inode(s); + + BUG_ON(!root); + + root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; + root->i_uid = root->i_gid = 0; +#if 0 + root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; + dentry = d_alloc(NULL, &d_name); + dentry->d_sb = s; + dentry->d_parent = dentry; + d_instantiate(dentry, root); +#endif + s->s_root = dentry; + s->s_flags |= MS_ACTIVE; + + mnt->mnt_sb = s; + mnt->mnt_root = dget(s->s_root); + + DEBUG_MSG("root mnt sb @ %p", mnt->mnt_sb); + + return 0; +} + +void inode_init_once(struct inode *inode) +{ + WARN_UNIMPL; +} + diff --git a/libdde-linux26/lib/src/arch/l4/irq.c b/libdde-linux26/lib/src/arch/l4/irq.c new file mode 100644 index 00000000..20bd57f0 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/irq.c @@ -0,0 +1,244 @@ +/* + * \brief Hardware-interrupt support + * \author Christian Helmuth <ch12@os.inf.tu-dresden.de> + * \date 2007-02-12 + * + * + * + * XXX Consider support for IRQ_HANDLED and friends (linux/irqreturn.h) + */ + +/* Linux */ +#include <linux/interrupt.h> +#include <linux/string.h> /* memset() */ + +/* local */ +#include "dde26.h" +#include "local.h" + +/* dummy */ +irq_cpustat_t irq_stat[CONFIG_NR_CPUS]; + +/** + * IRQ handling data + */ +static struct dde_irq +{ + unsigned irq; /* IRQ number */ + unsigned count; /* usage count */ + int shared; /* shared IRQ */ + struct ddekit_thread *thread; /* DDEKit interrupt thread */ + struct irqaction *action; /* Linux IRQ action */ + + struct dde_irq *next; /* next DDE IRQ */ +} *used_irqs; + + +static void irq_thread_init(void *p) { + l4dde26_process_add_worker(); } + + +extern ddekit_sem_t *dde_softirq_sem; +static void irq_handler(void *arg) +{ + struct dde_irq *irq = arg; + struct irqaction *action; + +#if 0 + DEBUG_MSG("irq 0x%x", irq->irq); +#endif + /* interrupt occurred - call all handlers */ + for (action = irq->action; action; action = action->next) { + irqreturn_t r = action->handler(action->irq, action->dev_id); +#if 0 + DEBUG_MSG("return: %s", r == IRQ_HANDLED ? "IRQ_HANDLED" : r == IRQ_NONE ? 
"IRQ_NONE" : "??"); +#endif + } + + /* upon return we check for pending soft irqs */ + if (local_softirq_pending()) + ddekit_sem_up(dde_softirq_sem); +} + + +/***************************** + ** IRQ handler bookkeeping ** + *****************************/ + +/** + * Claim IRQ + * + * \return usage counter or negative error code + * + * FIXME list locking + * FIXME are there more races? + */ +static int claim_irq(struct irqaction *action) +{ + int shared = action->flags & IRQF_SHARED ? 1 : 0; + struct dde_irq *irq; + + /* check if IRQ already used */ + for (irq = used_irqs; irq; irq = irq->next) + if (irq->irq == action->irq) break; + + /* we have to setup IRQ handling */ + if (!irq) { + /* allocate and initalize new descriptor */ + irq = ddekit_simple_malloc(sizeof(*irq)); + if (!irq) return -ENOMEM; + memset(irq, 0, sizeof(*irq)); + + irq->irq = action->irq; + irq->shared = shared; + irq->next = used_irqs; + used_irqs = irq; + + /* attach to interrupt */ + irq->thread = ddekit_interrupt_attach(irq->irq, + irq->shared, + irq_thread_init, + irq_handler, + (void *)irq); + if (!irq->thread) { + used_irqs = irq->next; + ddekit_simple_free(irq); + return -EBUSY; + } + } + + /* does desciptor allow our new handler? */ + if ((!irq->shared || !shared) && irq->action) return -EBUSY; + + /* add handler */ + irq->count++; + action->next = irq->action; + irq->action = action; + + return irq->count; +} + + +/** + * Free previously claimed IRQ + * + * \return usage counter or negative error code + */ +static struct irqaction *release_irq(unsigned irq_num, void *dev_id) +{ + struct dde_irq *prev_irq, *irq; + + /* check if IRQ already used */ + for (prev_irq = 0, irq = used_irqs; irq; + prev_irq = irq, irq = irq->next) + if (irq->irq == irq_num) break; + + if (!irq) return 0; + + struct irqaction *prev_action, *action; + + for (prev_action = 0, action = irq->action; action; + prev_action = action, action = action->next) + if (action->dev_id == dev_id) break; + + if (!action) return 0; + + /* dequeue action from irq */ + if (prev_action) + prev_action->next = action->next; + else + irq->action = action->next; + + /* dequeue irq from used_irqs list and free structure, + if no more actions available */ + if (!irq->action) { + if (prev_irq) + prev_irq->next = irq->next; + else + used_irqs = irq->next; + + /* detach from interrupt */ + ddekit_interrupt_detach(irq->irq); + + ddekit_simple_free(irq); + } + + return action; +} + + +/*************** + ** Linux API ** + ***************/ + +/** + * Request interrupt + * + * \param irq interrupt number + * \param handler interrupt handler -> top half + * \param flags interrupt handling flags (SA_SHIRQ, ...) + * \param dev_name device name + * \param dev_id cookie passed back to handler + * + * \return 0 on success; error code otherwise + * + * \todo FIXME consider locking! 
+ */ +int request_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *dev_name, void *dev_id) +{ + if (!handler) return -EINVAL; + + /* facilitate Linux irqaction for this handler */ + struct irqaction *irq_action = ddekit_simple_malloc(sizeof(*irq_action)); + if (!irq_action) return -ENOMEM; + memset(irq_action, 0, sizeof(*irq_action)); + + irq_action->handler = handler; + irq_action->flags = flags; + irq_action->name = dev_name; + irq_action->dev_id = dev_id; + irq_action->irq = irq; + + /* attach to IRQ */ + int err = claim_irq(irq_action); + if (err < 0) return err; + + return 0; +} + +/** Release Interrupt + * \ingroup mod_irq + * + * \param irq interrupt number + * \param dev_id cookie passed back to handler + * + */ +void free_irq(unsigned int irq, void *dev_id) +{ + struct irqaction *irq_action = release_irq(irq, dev_id); + + if (irq_action) + ddekit_simple_free(irq_action); +} + +void disable_irq(unsigned int irq) +{ + ddekit_interrupt_disable(irq); +} + +void disable_irq_nosync(unsigned int irq) +{ + /* + * Note: + * In contrast to the _nosync semantics, DDEKit's + * disable definitely waits until a currently executed + * IRQ handler terminates. + */ + ddekit_interrupt_disable(irq); +} + +void enable_irq(unsigned int irq) +{ + ddekit_interrupt_enable(irq); +} diff --git a/libdde-linux26/lib/src/arch/l4/kmalloc.c b/libdde-linux26/lib/src/arch/l4/kmalloc.c new file mode 100644 index 00000000..816f443c --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/kmalloc.c @@ -0,0 +1,209 @@ +/* + * \brief kmalloc() implementation + * \author Christian Helmuth <ch12@os.inf.tu-dresden.de> + * \date 2007-01-24 + * + * In Linux 2.6 this resides in mm/slab.c. + * + * This implementation of kmalloc() stays with Linux's and uses kmem_caches for + * some power of two bytes. For larger allocations ddedkit_large_malloc() is + * used. This way, we optimize for speed and potentially waste memory + * resources. + */ + +/* Linux */ +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/mm.h> +#include <asm/io.h> + +#include "local.h" + +#include <dde26.h> + +/* dummy */ +int forbid_dac; + +/* This stuff is needed by some drivers, e.g. for ethtool. + * XXX: This is a fake, implement it if you really need ethtool stuff. + */ +struct page* mem_map = NULL; +static bootmem_data_t contig_bootmem_data; +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; + +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return 0; +} +EXPORT_SYMBOL(remap_pfn_range); + +/******************* + ** Configuration ** + *******************/ + +#define DEBUG_MALLOC 0 + +/******************** + ** Implementation ** + ********************/ + +/* + * These are the default caches for kmalloc. Custom caches can have other sizes. 
+ */ +static struct cache_sizes malloc_sizes[] = { +#define CACHE(x) { .cs_size = (x) }, +#include <linux/kmalloc_sizes.h> + CACHE(ULONG_MAX) +#undef CACHE +}; + + +/* + * kmalloc() cache names + */ +static const char *malloc_names[] = { +#define CACHE(x) "size-" #x, +#include <linux/kmalloc_sizes.h> + NULL +#undef CACHE +}; + + +/** + * Find kmalloc() cache for size + */ +static struct kmem_cache *find_cache(size_t size) +{ + struct cache_sizes *sizes; + + for (sizes = malloc_sizes; size > sizes->cs_size; ++sizes) ; + + return sizes->cs_cachep; +} + + +/** + * Free previously allocated memory + * @objp: pointer returned by kmalloc. + * + * If @objp is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *objp) +{ + if (!objp) return; + + /* find cache back-pointer */ + void **p = (void **)objp - 1; + + ddekit_log(DEBUG_MALLOC, "objp=%p cache=%p (%d)", + p, *p, *p ? kmem_cache_size(*p) : 0); + + if (*p) + /* free from cache */ + kmem_cache_free(*p, p); + else + /* no cache for this size - use ddekit free */ + ddekit_large_free(p); +} + + +/** + * Allocate memory + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * kmalloc is the normal method of allocating memory + * in the kernel. + */ +void *__kmalloc(size_t size, gfp_t flags) +{ + /* add space for back-pointer */ + size += sizeof(void *); + + /* find appropriate cache */ + struct kmem_cache *cache = find_cache(size); + + void **p; + if (cache) { + /* allocate from cache */ + p = kmem_cache_alloc(cache, flags); + if (!p) { + printk("__kmalloc: kmem_cache_alloc %s fails\n", + ((char **)cache)[0]); + } + } + else { + /* no cache for this size - use ddekit malloc */ + p = ddekit_large_malloc(size); + if (flags & __GFP_ZERO) + memset (p, 0, size); + if (!p) { + printk("__kmalloc: ddekit_large_malloc %d fails\n", + size); + } + } + + ddekit_log(DEBUG_MALLOC, "size=%d, cache=%p (%d) => %p", + size, cache, cache ? kmem_cache_size(cache) : 0, p); + + /* return pointer to actual chunk */ + if (p) { + *p = cache; + p++; + } + return p; +} + + +size_t ksize(const void *p) +{ + struct kmem_cache *cache = (struct kmem_cache *)*((void**)p - 1); + if (cache) + return kmem_cache_size(cache); + return -1; +} + + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = (void *)__get_free_pages(flag, get_order(size)); + + if (ret != NULL) { + memset(ret, 0, size); + *dma_handle = virt_to_bus(ret); + } + return ret; +} + + +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + + +/******************** + ** Initialization ** + ********************/ + +/** + * dde_linux kmalloc initialization + */ +void l4dde26_kmalloc_init(void) +{ + struct cache_sizes *sizes = malloc_sizes; + const char **names = malloc_names; + + /* init malloc sizes array */ + for (; sizes->cs_size != ULONG_MAX; ++sizes, ++names) + sizes->cs_cachep = kmem_cache_create(*names, sizes->cs_size, 0, 0, 0); +} diff --git a/libdde-linux26/lib/src/arch/l4/kmem_cache.c b/libdde-linux26/lib/src/arch/l4/kmem_cache.c new file mode 100644 index 00000000..5e44c140 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/kmem_cache.c @@ -0,0 +1,211 @@ +/* + * \brief Kmem_cache implementation + * \author Christian Helmuth + * \date 2007-01-22 + * + * In Linux 2.6 this resides in mm/slab.c. 
+ * + * I'll disregard the following function currently... + * + * extern struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags); + * extern void *kmem_cache_zalloc(struct kmem_cache *, gfp_t); + */ + +/* Linux */ +#include <linux/slab.h> + +#include "local.h" + + +/******************* + ** Configuration ** + *******************/ + +#define DEBUG_SLAB 0 + +#if DEBUG_SLAB +# define DEBUG_SLAB_ALLOC 1 +#else +# define DEBUG_SLAB_ALLOC 0 +#endif + +/* + * Kmem cache structure + */ +struct kmem_cache +{ + const char *name; /**< cache name */ + unsigned size; /**< obj size */ + + struct ddekit_slab *ddekit_slab_cache; /**< backing DDEKit cache */ + ddekit_lock_t cache_lock; /**< lock */ + void (*ctor)(void *); /**< object constructor */ +}; + + +/** + * Return size of objects in cache + */ +unsigned int kmem_cache_size(struct kmem_cache *cache) +{ + return cache->size; +} + + +/** + * Return name of cache + */ +const char *kmem_cache_name(struct kmem_cache *cache) +{ + return cache->name; +} + + +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. + */ +int kmem_cache_shrink(struct kmem_cache *cache) +{ + /* noop */ + return 1; +} + + +/** + * kmem_cache_free - Deallocate an object + * @cachep: The cache the allocation was from. + * @objp: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ +void kmem_cache_free(struct kmem_cache *cache, void *objp) +{ + ddekit_log(DEBUG_SLAB_ALLOC, "\"%s\" (%p)", cache->name, objp); + + ddekit_lock_lock(&cache->cache_lock); + ddekit_slab_free(cache->ddekit_slab_cache, objp); + ddekit_lock_unlock(&cache->cache_lock); +} + + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. The flags are only relevant + * if the cache has no available objects. + */ +void *kmem_cache_alloc(struct kmem_cache *cache, gfp_t flags) +{ + void *ret; + + ddekit_log(DEBUG_SLAB_ALLOC, "\"%s\" flags=%x", cache->name, flags); + + ddekit_lock_lock(&cache->cache_lock); + ret = ddekit_slab_alloc(cache->ddekit_slab_cache); + ddekit_lock_unlock(&cache->cache_lock); + + // XXX: is it valid to run ctor AND memset to zero? + if (flags & __GFP_ZERO) + memset(ret, 0, cache->size); + else if (cache->ctor) + cache->ctor(ret); + + return ret; +} + + +/** + * kmem_cache_destroy - delete a cache + * @cachep: the cache to destroy + * + * Remove a struct kmem_cache object from the slab cache. + * Returns 0 on success. + * + * It is expected this function will be called by a module when it is + * unloaded. This will remove the cache completely, and avoid a duplicate + * cache being allocated each time a module is loaded and unloaded, if the + * module doesn't have persistent in-kernel storage across loads and unloads. + * + * The cache must be empty before calling this function. + * + * The caller must guarantee that noone will allocate memory from the cache + * during the kmem_cache_destroy(). + */ +void kmem_cache_destroy(struct kmem_cache *cache) +{ + ddekit_log(DEBUG_SLAB, "\"%s\"", cache->name); + + ddekit_slab_destroy(cache->ddekit_slab_cache); + ddekit_simple_free(cache); +} + + +/** + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. 
+ * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within a int, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache + * and the @dtor is run before the pages are handed back. + * + * @name must be valid until the cache is destroyed. This implies that + * the module calling this has to destroy the cache before getting unloaded. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ +struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, + void (*ctor)(void *)) +{ + ddekit_log(DEBUG_SLAB, "\"%s\" obj_size=%d", name, size); + + struct kmem_cache *cache; + + if (!name) { + printk("kmem_cache name reqeuired\n"); + return 0; + } + + cache = ddekit_simple_malloc(sizeof(*cache)); + if (!cache) { + printk("No memory for slab cache\n"); + return 0; + } + + /* Initialize a physically contiguous cache for kmem */ + if (!(cache->ddekit_slab_cache = ddekit_slab_init(size, 1))) { + printk("DDEKit slab init failed\n"); + ddekit_simple_free(cache); + return 0; + } + + cache->name = name; + cache->size = size; + cache->ctor = ctor; + + ddekit_lock_init_unlocked(&cache->cache_lock); + + return cache; +} diff --git a/libdde-linux26/lib/src/arch/l4/local.h b/libdde-linux26/lib/src/arch/l4/local.h new file mode 100644 index 00000000..d834a9db --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/local.h @@ -0,0 +1,99 @@ +#ifndef __DDE26_LOCAL_H +#define __DDE26_LOCAL_H + +#include <linux/sched.h> + +#include <ddekit/assert.h> +#include <ddekit/condvar.h> +#include <ddekit/debug.h> +#include <ddekit/initcall.h> +#include <ddekit/interrupt.h> +#include <ddekit/lock.h> +#include <ddekit/memory.h> +#include <ddekit/panic.h> +#include <ddekit/pci.h> +#include <ddekit/pgtab.h> +#include <ddekit/printf.h> +#include <ddekit/resources.h> +#include <ddekit/semaphore.h> +#include <ddekit/thread.h> +#include <ddekit/types.h> +#include <ddekit/timer.h> + +#include <dde26.h> + +#define DDE_DEBUG 1 +#define DDE_FERRET 0 + +/* Ferret Debugging stuff, note that this is the only point we are using + * L4 headers directly and only for debugging. */ +#if DDE_FERRET +#include <l4/ferret/maj_min.h> +#include <l4/ferret/client.h> +#include <l4/ferret/clock.h> +#include <l4/ferret/types.h> +#include <l4/ferret/sensors/list_producer.h> +#include <l4/ferret/sensors/list_producer_wrap.h> +extern ferret_list_local_t *ferret_ore_sensor; +#endif + +/*** + * Internal representation of a Linux kernel thread. This struct + * contains Linux' data as well as some additional data used by DDE. + */ +typedef struct dde26_thread_data +{ + /* NOTE: _threadinfo needs to be first in this struct! 
*/ + struct thread_info _thread_info; ///< Linux thread info (see current()) + ddekit_thread_t *_ddekit_thread; ///< underlying DDEKit thread + ddekit_sem_t *_sleep_lock; ///< lock used for sleep_interruptible() + struct pid _vpid; ///< virtual PID +} dde26_thread_data; + +#define LX_THREAD(thread_data) ((thread_data)->_thread_info) +#define LX_TASK(thread_data) ((thread_data)->_thread_info.task) +#define DDEKIT_THREAD(thread_data) ((thread_data)->_ddekit_thread) +#define SLEEP_LOCK(thread_data) ((thread_data)->_sleep_lock) +#define VPID_P(thread_data) (&(thread_data)->_vpid) + +#if DDE_DEBUG +#define WARN_UNIMPL printk("unimplemented: %s\n", __FUNCTION__) +#define DEBUG_MSG(msg, ...) printk("%s: \033[36m"msg"\033[0m\n", __FUNCTION__, ##__VA_ARGS__) + +#define DECLARE_INITVAR(name) \ + static struct { \ + int _initialized; \ + char *name; \ + } init_##name = {0, #name,} + +#define INITIALIZE_INITVAR(name) init_##name._initialized = 1 + +#define CHECK_INITVAR(name) \ + if (init_##name._initialized == 0) { \ + printk("DDE26: \033[31;1mUsing uninitialized subsystem: "#name"\033[0m\n"); \ + BUG(); \ + } + +#else /* !DDE_DEBUG */ + +#define WARN_UNIMPL do {} while(0) +#define DEBUG_MSG(...) do {} while(0) +#define DECLARE_INITVAR(name) +#define CHECK_INITVAR(name) do {} while(0) +#define INITIALIZE_INITVAR(name) do {} while(0) + +#endif + +/* since _thread_info always comes first in the thread_data struct, + * we can derive the dde26_thread_data from a task struct by simply + * dereferencing its thread_info pointer + */ +static dde26_thread_data *lxtask_to_ddethread(struct task_struct *t) +{ + return (dde26_thread_data *)(task_thread_info(t)); +} + +extern struct thread_info init_thread; +extern struct task_struct init_task; + +#endif diff --git a/libdde-linux26/lib/src/arch/l4/mm-helper.c b/libdde-linux26/lib/src/arch/l4/mm-helper.c new file mode 100644 index 00000000..cc4cc1d1 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/mm-helper.c @@ -0,0 +1,40 @@ +/* Linux */ +#include <linux/gfp.h> +#include <linux/string.h> +#include <asm/page.h> + +#include "local.h" + +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + WARN_UNIMPL; + return 0; +} + +void *__alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + WARN_UNIMPL; + return 0; +} + +/* + * Stolen from linux-2.6.29/fs/libfs.c + */ +ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + if (pos < 0) + return -EINVAL; + if (pos > available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy(to, from + pos, count); + *ppos = pos + count; + + return count; +} + +int capable(int f) { return 1; } diff --git a/libdde-linux26/lib/src/arch/l4/net.c b/libdde-linux26/lib/src/arch/l4/net.c new file mode 100644 index 00000000..6e799119 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/net.c @@ -0,0 +1,36 @@ +/****************************************************************************** + * DDELinux networking utilities. * + * * + * Bjoern Doebel <doebel@tudos.org> * + * * + * (c) 2005 - 2007 Technische Universitaet Dresden * + * This file is part of DROPS, which is distributed under the terms of the * + * GNU General Public License 2. Please see the COPYING file for details. 
* + ******************************************************************************/ + +#include <dde26_net.h> + +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include "local.h" + + +/* Callback function to be called if a network packet arrives and needs to + * be handled by netif_rx() or netif_receive_skb() + */ +linux_rx_callback l4dde26_rx_callback = NULL; + + +/* Register a netif_rx callback function. + * + * \return pointer to old callback function + */ +linux_rx_callback l4dde26_register_rx_callback(linux_rx_callback cb) +{ + linux_rx_callback old = l4dde26_rx_callback; + l4dde26_rx_callback = cb; + DEBUG_MSG("New rx callback @ %p.", cb); + + return old; +} diff --git a/libdde-linux26/lib/src/arch/l4/page_alloc.c b/libdde-linux26/lib/src/arch/l4/page_alloc.c new file mode 100644 index 00000000..e887bd51 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/page_alloc.c @@ -0,0 +1,276 @@ +/* + * \brief Page allocation + * \author Christian Helmuth <ch12@tudos.org> + * Bjoern Doebel <doebel@tudos.org> + * \date 2007-01-22 + * + * In Linux 2.6 this resides in mm/page_alloc.c. + * + * This implementation is far from complete as it does not cover "struct page" + * emulation. In Linux, there's an array of structures for all pages. In + * particular, iteration works for this array like: + * + * struct page *p = alloc_pages(3); // p refers to first page of allocation + * ++p; // p refers to second page + * + * There may be more things to cover and we should have a deep look into the + * kernel parts we want to reuse. Candidates for problems may be file systems, + * storage (USB, IDE), and video (bttv). + */ + +/* Linux */ +#include <linux/gfp.h> +#include <linux/string.h> +#include <linux/pagevec.h> +#include <linux/mm.h> +#include <asm/page.h> + +#include "local.h" + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +/******************* + ** Configuration ** + *******************/ + +#define DEBUG_PAGE_ALLOC 0 + + +/* + * DDE page cache + * + * We need to store all pages somewhere (which in the Linux kernel is + * performed by the huge VM infrastructure. Purpose for us is: + * - make virt_to_phys() work + * - enable external clients to hand in memory (e.g., a dm_phys + * dataspace and make it accessible as Linux pages to the DDE) + */ + +#define DDE_PAGE_CACHE_SHIFT 10 +#define DDE_PAGE_CACHE_SIZE (1 << DDE_PAGE_CACHE_SHIFT) +#define DDE_PAGE_CACHE_MASK (DDE_PAGE_CACHE_SIZE - 1) + +typedef struct +{ + struct hlist_node list; + struct page *page; +} page_cache_entry; + +static struct hlist_head dde_page_cache[DDE_PAGE_CACHE_SIZE]; + +/** Hash function to map virtual addresses to page cache buckets. 
*/ +#define VIRT_TO_PAGEHASH(a) ((((unsigned long)a) >> PAGE_SHIFT) & DDE_PAGE_CACHE_MASK) + + +void dde_page_cache_add(struct page *p) +{ + unsigned int hashval = VIRT_TO_PAGEHASH(p->virtual); + + page_cache_entry *e = kmalloc(sizeof(page_cache_entry), GFP_KERNEL); + +#if DEBUG_PAGE_ALLOC + DEBUG_MSG("virt %p, hash: %x", p->virtual, hashval); +#endif + + e->page = p; + INIT_HLIST_NODE(&e->list); + + hlist_add_head(&e->list, &dde_page_cache[hashval]); +} + + +void dde_page_cache_remove(struct page *p) +{ + unsigned int hashval = VIRT_TO_PAGEHASH(p->virtual); + struct hlist_node *hn = NULL; + struct hlist_head *h = &dde_page_cache[hashval]; + page_cache_entry *e = NULL; + struct hlist_node *v = NULL; + + hlist_for_each_entry(e, hn, h, list) { + if ((unsigned long)e->page->virtual == ((unsigned long)p->virtual & PAGE_MASK)) + v = hn; + break; + } + + if (v) { +#if DEBUG_PAGE_ALLOC + DEBUG_MSG("deleting node %p which contained page %p", v, p); +#endif + hlist_del(v); + } +} + + +struct page* dde_page_lookup(unsigned long va) +{ + unsigned int hashval = VIRT_TO_PAGEHASH(va); + + struct hlist_node *hn = NULL; + struct hlist_head *h = &dde_page_cache[hashval]; + page_cache_entry *e = NULL; + + hlist_for_each_entry(e, hn, h, list) { + if ((unsigned long)e->page->virtual == (va & PAGE_MASK)) + return e->page; + } + + return NULL; +} + + +struct page * __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nm) +{ + /* XXX: In fact, according to order, we should have one struct page + * for every page, not only for the first one. + */ + struct page *ret = kmalloc(sizeof(*ret), GFP_KERNEL); + + ret->virtual = (void *)__get_free_pages(gfp_mask, order); + dde_page_cache_add(ret); + + return ret; +} + + +unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +{ + ddekit_log(DEBUG_PAGE_ALLOC, "gfp_mask=%x order=%d (%d bytes)", + gfp_mask, order, PAGE_SIZE << order); + + Assert(gfp_mask != GFP_DMA); + void *p = ddekit_large_malloc(PAGE_SIZE << order); + + return (unsigned long)p; +} + + +unsigned long get_zeroed_page(gfp_t gfp_mask) +{ + unsigned long p = __get_free_pages(gfp_mask, 0); + + if (p) memset((void *)p, 0, PAGE_SIZE); + + return (unsigned long)p; +} + + +void free_hot_page(struct page *page) +{ + WARN_UNIMPL; +} + +/* + * XXX: If alloc_pages() gets fixed to allocate a page struct per page, + * this needs to be adapted, too. + */ +void __free_pages(struct page *page, unsigned int order) +{ + free_pages((unsigned long)page->virtual, order); + dde_page_cache_remove(page); +} + +void __pagevec_free(struct pagevec *pvec) +{ + WARN_UNIMPL; +} + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + WARN_UNIMPL; + return 0; +} + +/** + * ... + * + * XXX order may be larger than allocation at 'addr' - it may comprise several + * allocation via __get_free_pages()! 
+ */ +void free_pages(unsigned long addr, unsigned int order) +{ + ddekit_log(DEBUG_PAGE_ALLOC, "addr=%p order=%d", (void *)addr, order); + + ddekit_large_free((void *)addr); +} + + +unsigned long __pa(volatile void *addr) +{ + return ddekit_pgtab_get_physaddr((void*)addr); +} + +void *__va(unsigned long addr) +{ + return (void*)ddekit_pgtab_get_virtaddr((ddekit_addr_t) addr); +} + + +int set_page_dirty_lock(struct page *page) +{ + WARN_UNIMPL; + return 0; +} + + +/* + * basically copied from linux/mm/page_alloc.c + */ +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long limit) +{ + void * table = NULL; + unsigned long log2qty; + unsigned long size; + + if (numentries == 0) + numentries = 1024; + + log2qty = ilog2(numentries); + size = bucketsize << log2qty; + + do { + unsigned long order; + for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++); + table = (void*) __get_free_pages(GFP_ATOMIC, order); + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + tablename, + (1U << log2qty), + ilog2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} + + +static void __init dde_page_cache_init(void) +{ + printk("Initializing DDE page cache\n"); + int i=0; + + for (i; i < DDE_PAGE_CACHE_SIZE; ++i) + INIT_HLIST_HEAD(&dde_page_cache[i]); +} + +core_initcall(dde_page_cache_init); diff --git a/libdde-linux26/lib/src/arch/l4/param.c b/libdde-linux26/lib/src/arch/l4/param.c new file mode 100644 index 00000000..c459428a --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/param.c @@ -0,0 +1,95 @@ +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/slab.h> + +/* Lazy bastard, eh? */ +#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ + int param_set_##name(const char *val, struct kernel_param *kp) \ + { \ + return 0; \ + } \ + int param_get_##name(char *buffer, struct kernel_param *kp) \ + { \ + return 0;\ + } + +STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); + +int param_set_charp(const char *val, struct kernel_param *kp) +{ + if (!val) { + printk(KERN_ERR "%s: string parameter expected\n", + kp->name); + return -EINVAL; + } + + if (strlen(val) > 1024) { + printk(KERN_ERR "%s: string parameter too long\n", + kp->name); + return -ENOSPC; + } + + *(char **)kp->arg = (char *)val; + return 0; +} + +int param_get_charp(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "%s", *((char **)kp->arg)); +} + +int param_set_bool(const char *val, struct kernel_param *kp) +{ + /* No equals means "set"... 
*/ + if (!val) val = "1"; + + /* One of =[yYnN01] */ + switch (val[0]) { + case 'y': case 'Y': case '1': + *(int *)kp->arg = 1; + return 0; + case 'n': case 'N': case '0': + *(int *)kp->arg = 0; + return 0; + } + return -EINVAL; +} + +int param_get_bool(char *buffer, struct kernel_param *kp) +{ + /* Y and N chosen as being relatively non-coder friendly */ + return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); +} + +int param_set_invbool(const char *val, struct kernel_param *kp) +{ + int boolval, ret; + struct kernel_param dummy; + + dummy.arg = &boolval; + ret = param_set_bool(val, &dummy); + if (ret == 0) + *(int *)kp->arg = !boolval; + return ret; +} + +int param_get_invbool(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); +} + +int printk_ratelimit(void) +{ + return 0; +} diff --git a/libdde-linux26/lib/src/arch/l4/pci.c b/libdde-linux26/lib/src/arch/l4/pci.c new file mode 100644 index 00000000..b50a7353 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/pci.c @@ -0,0 +1,205 @@ +#include "local.h" + +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/list.h> +#include <linux/init.h> + +/* will include $(CONTRIB)/drivers/pci/pci.h */ +#include "pci.h" + +DECLARE_INITVAR(dde26_pci); + +/** PCI device descriptor */ +typedef struct l4dde_pci_dev { + struct list_head next; /**< chain info */ + struct ddekit_pci_dev *ddekit_dev; /**< corresponding DDEKit descriptor */ + struct pci_dev *linux_dev; /**< Linux descriptor */ +} l4dde_pci_dev_t; + + +/******************************************************************************************* + ** PCI data ** + *******************************************************************************************/ +/** List of Linux-DDEKit PCIDev mappings */ +static LIST_HEAD(pcidev_mappings); + +static int l4dde26_pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); +static int l4dde26_pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); + +/** PCI operations for our virtual PCI bus */ +static struct pci_ops dde_pcibus_ops = { + .read = l4dde26_pci_read, + .write = l4dde26_pci_write, +}; + + +/******************************************************************************************* + ** Read/write PCI config space. This is simply mapped to the DDEKit functions. 
** + *******************************************************************************************/ +static int l4dde26_pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) +{ + return ddekit_pci_read(bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val); +} + +static int l4dde26_pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) +{ + return ddekit_pci_write(bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val); +} + +int pci_irq_enable(struct pci_dev *dev) +{ + int irq = dev->irq; + int pin = 0; + int ret; + + DEBUG_MSG("dev %p", dev); + if (!dev) + return -EINVAL; + + pin = (int)dev->pin; + DEBUG_MSG("irq %d, pin %d", dev->irq, dev->pin); + if (!pin) { + dev_warn(&dev->dev, + "No interrupt pin configured for device %s\n", + pci_name(dev)); + return 0; + } + pin--; + + ret = ddekit_pci_irq_enable(dev->bus->number, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), pin, &irq); + if (ret) { + dev_warn(&dev->dev, "Interrupt enable failed for device %s (%d)\n", + pci_name(dev), ret); + return -1; + } + + dev_info(&dev->dev, "PCI INT %c -> GSI %d -> IRQ %d\n", + 'A' + pin, irq, dev->irq); + + dev->irq = irq; + return 0; +} + +int __pci_enable_device(struct pci_dev *dev) +{ + WARN_UNIMPL; + return 0; +} + + +/** + * pci_enable_device - Initialize device before it's used by a driver. + * + * Initialize device before it's used by a driver. Ask low-level code + * to enable I/O and memory. Wake up the device if it was suspended. + * Beware, this function can fail. + * + * \param dev PCI device to be initialized + * + */ +int +pci_enable_device(struct pci_dev *dev) +{ + CHECK_INITVAR(dde26_pci); +// WARN_UNIMPL; + return pci_irq_enable(dev); +} + + +/** + * pci_disable_device - Disable PCI device after use + * + * Signal to the system that the PCI device is not in use by the system + * anymore. This only involves disabling PCI bus-mastering, if active. + * + * \param dev PCI device to be disabled + */ +void pci_disable_device(struct pci_dev *dev) +{ + CHECK_INITVAR(dde26_pci); + WARN_UNIMPL; +} + + +void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) +{ + //WARN_UNIMPL; +} + +static unsigned int pcibios_max_latency = 255; + +void pcibios_set_master(struct pci_dev *dev) +{ + u8 lat; + pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); + if (lat < 16) + lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; + else if (lat > pcibios_max_latency) + lat = pcibios_max_latency; + else + return; + pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); +} + +int pci_create_sysfs_dev_files(struct pci_dev *pdev) +{ + return 0; +} + +unsigned int pcibios_assign_all_busses(void) +{ + return 1; +} + +void +pcibios_align_resource(void *data, struct resource *res, + resource_size_t size, resource_size_t align) +{ + WARN_UNIMPL; +} + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ +#if 0 + int err; + + if ((err = pcibios_enable_resources(dev, mask)) < 0) + return err; + + return pcibios_enable_irq(dev); +#endif + WARN_UNIMPL; + return -1; +} + +/******************************************************************************************* + ** Initialization function ** + *******************************************************************************************/ + +/** Initialize DDELinux PCI subsystem. 
+ */ +void __init l4dde26_init_pci(void) +{ + ddekit_pci_init(); + int nr; + char found[256] = { }; + int bus, slot, func; + + for (nr = 0; ; nr++) { + if (ddekit_pci_get_device(nr, &bus, &slot, &func) != 0) + break; + if (!found[bus]) { + struct pci_bus *pci_bus = pci_create_bus(NULL, bus, &dde_pcibus_ops, NULL); + Assert(pci_bus); + pci_do_scan_bus(pci_bus); + + found[bus] = 1; + } + } + + INITIALIZE_INITVAR(dde26_pci); +} + +arch_initcall(l4dde26_init_pci); diff --git a/libdde-linux26/lib/src/arch/l4/power.c b/libdde-linux26/lib/src/arch/l4/power.c new file mode 100644 index 00000000..e36487bd --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/power.c @@ -0,0 +1,23 @@ +/* Dummy functions for power management. */ + +#include "local.h" +#include <linux/device.h> + +int device_pm_add(struct device * dev) +{ + WARN_UNIMPL; + return 0; +} + + +void device_pm_remove(struct device * dev) +{ + WARN_UNIMPL; +} + +int pm_qos_add_requirement(int qos, char *name, s32 value) { return 0; } +int pm_qos_update_requirement(int qos, char *name, s32 new_value) { return 0; } +void pm_qos_remove_requirement(int qos, char *name) { } +int pm_qos_requirement(int qos) { return 0; } +int pm_qos_add_notifier(int qos, struct notifier_block *notifier) { return 0; } +int pm_qos_remove_notifier(int qos, struct notifier_block *notifier) { return 0; } diff --git a/libdde-linux26/lib/src/arch/l4/process.c b/libdde-linux26/lib/src/arch/l4/process.c new file mode 100644 index 00000000..ac700f82 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/process.c @@ -0,0 +1,343 @@ +#include <dde.h> +#include <dde26.h> + +#include <asm/atomic.h> + +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/thread_info.h> +#include <linux/sched.h> +#include <linux/pid.h> +#include <linux/vmalloc.h> + +#include "local.h" + +/***************************************************************************** + ** Current() implementation ** + *****************************************************************************/ +struct thread_info *current_thread_info(void) +{ + dde26_thread_data *cur = (dde26_thread_data *)ddekit_thread_get_my_data(); + return &LX_THREAD(cur); +} + +struct task_struct *get_current(void) +{ + return current_thread_info()->task; +} + +/***************************************************************************** + ** PID-related stuff ** + ** ** + ** Linux manages lists of PIDs that are handed out to processes so that at ** + ** a later point it is able to determine which task_struct belongs to a ** + ** certain PID. We implement this with a single list holding the mappings ** + ** for all our threads. ** + *****************************************************************************/ + +LIST_HEAD(_pid_task_list); +ddekit_lock_t _pid_task_list_lock; + +/** PID to task_struct mapping */ +struct pid2task +{ + struct list_head list; /**< list data */ + struct pid *pid; /**< PID */ + struct task_struct *ts; /**< task struct */ +}; + +struct pid init_struct_pid = INIT_STRUCT_PID; + +void put_pid(struct pid *pid) +{ + if (pid) + atomic_dec(&pid->count); + // no freeing here, our struct pid's are always allocated as + // part of the dde26_thread_data +} + +/** Attach PID to a certain task struct. 
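+ *
+ * DDE keeps a single global list of pid2task mappings (_pid_task_list,
+ * protected by _pid_task_list_lock) so that find_task_by_pid() can later
+ * translate a PID back into its task_struct; detach_pid() removes the
+ * entry again.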
*/ +void attach_pid(struct task_struct *task, enum pid_type type + __attribute__((unused)), struct pid *pid) +{ + /* Initialize a new pid2task mapping */ + struct pid2task *pt = kmalloc(sizeof(struct pid2task), GFP_KERNEL); + pt->pid = get_pid(pid); + pt->ts = task; + + /* add to list */ + ddekit_lock_lock(&_pid_task_list_lock); + list_add(&pt->list, &_pid_task_list); + ddekit_lock_unlock(&_pid_task_list_lock); +} + +/** Detach PID from a task struct. */ +void detach_pid(struct task_struct *task, enum pid_type type __attribute__((unused))) +{ + struct list_head *p, *n, *h; + + h = &_pid_task_list; + + ddekit_lock_lock(&_pid_task_list_lock); + /* search for mapping with given task struct and free it if necessary */ + list_for_each_safe(p, n, h) { + struct pid2task *pt = list_entry(p, struct pid2task, list); + if (pt->ts == task) { + put_pid(pt->pid); + list_del(p); + kfree(pt); + break; + } + } + ddekit_lock_unlock(&_pid_task_list_lock); +} + +struct task_struct *find_task_by_pid_type(int type, int nr) +{ + struct list_head *h, *p; + h = &_pid_task_list; + + ddekit_lock_lock(&_pid_task_list_lock); + list_for_each(p, h) { + struct pid2task *pt = list_entry(p, struct pid2task, list); + if (pid_nr(pt->pid) == nr) { + ddekit_lock_unlock(&_pid_task_list_lock); + return pt->ts; + } + } + ddekit_lock_unlock(&_pid_task_list_lock); + + return NULL; +} + + +struct task_struct *find_task_by_pid_ns(int nr, struct pid_namespace *ns) +{ + /* we don't implement PID name spaces */ + return find_task_by_pid_type(0, nr); +} + +struct task_struct *find_task_by_pid(int nr) +{ + return find_task_by_pid_type(0, nr); +} + +/***************************************************************************** + ** kernel_thread() implementation ** + *****************************************************************************/ +/* Struct containing thread data for a newly created kthread. */ +struct __kthread_data +{ + int (*fn)(void *); + void *arg; + ddekit_lock_t lock; + dde26_thread_data *kthread; +}; + +/** Counter for running kthreads. It is used to create unique names + * for kthreads. + */ +static atomic_t kthread_count = ATOMIC_INIT(0); + +/** Entry point for new kernel threads. Make this thread a DDE26 + * worker and then execute the real thread fn. + */ +static void __kthread_helper(void *arg) +{ + struct __kthread_data *k = (struct __kthread_data *)arg; + + /* + * Make a copy of the fn and arg pointers, as the kthread struct is + * deleted by our parent after notifying it and this may happen before we + * get to execute the function. + */ + int (*_fn)(void*) = k->fn; + void *_arg = k->arg; + + l4dde26_process_add_worker(); + + /* + * Handshake with creator - we store our thread data in the + * kthread struct and then unlock the lock to notify our + * creator about completing setup + */ + k->kthread = (dde26_thread_data *)ddekit_thread_get_my_data(); + ddekit_lock_unlock(&k->lock); + + do_exit(_fn(_arg)); +} + +/** Our implementation of Linux' kernel_thread() function. Setup a new + * thread running our __kthread_helper() function. 
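+ *
+ * Creator and new thread perform a small handshake: the creator holds a
+ * lock while spawning the DDEKit thread, the new thread records its
+ * dde26_thread_data in the shared __kthread_data struct and releases the
+ * lock, and only then does the creator read that data back and return the
+ * new thread's virtual PID.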
+ */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + ddekit_thread_t *t; + char name[20]; + struct __kthread_data *kt = vmalloc(sizeof(struct __kthread_data)); + ddekit_lock_t lock; + + /* Initialize (and grab) handshake lock */ + ddekit_lock_init(&lock); + ddekit_lock_lock(&lock); + + int threadnum = atomic_inc_return(&kthread_count); + kt->fn = fn; + kt->arg = arg; + kt->lock = lock; // Copy lock ptr, note that kt is freed by the + // new thread, so we MUST NOT use kt->lock after + // this point! + + snprintf(name, 20, ".kthread%x", threadnum); + t = ddekit_thread_create(__kthread_helper, + (void *)kt, name); + Assert(t); + + ddekit_lock_lock(&lock); + ddekit_lock_deinit(&lock); + + return pid_nr(VPID_P(kt->kthread)); +} + +/** Our implementation of exit(). For DDE purposes this only relates + * to kernel threads. + */ +void do_exit(long code) +{ + ddekit_thread_t *t = DDEKIT_THREAD(lxtask_to_ddethread(current)); +// printk("Thread %s exits with code %x\n", ddekit_thread_get_name(t), code); + + /* do some cleanup */ + detach_pid(current, 0); + + /* goodbye, cruel world... */ + ddekit_thread_exit(); +} + +/***************************************************************************** + ** Misc functions ** + *****************************************************************************/ + + +char *get_task_comm(char *buf, struct task_struct *tsk) +{ + char *ret; + /* buf must be at least sizeof(tsk->comm) in size */ + task_lock(tsk); + ret = strncpy(buf, tsk->comm, sizeof(tsk->comm)); + task_unlock(tsk); + return ret; +} + + +void set_task_comm(struct task_struct *tsk, char *buf) +{ + task_lock(tsk); + strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + task_unlock(tsk); +} + + +/***************************************************************************** + ** DDEKit gluecode, init functions ** + *****************************************************************************/ +/* Initialize a dde26 thread. + * + * - Allocate thread data, as well as a Linux task struct, + * - Fill in default values for thread_info, and task, + * - Adapt task struct's thread_info backreference + * - Initialize the DDE sleep lock + */ +static dde26_thread_data *init_dde26_thread(void) +{ + /* + * Virtual PID counter + */ + static atomic_t pid_counter = ATOMIC_INIT(0); + dde26_thread_data *t = vmalloc(sizeof(dde26_thread_data)); + Assert(t); + + memcpy(&t->_vpid, &init_struct_pid, sizeof(struct pid)); + t->_vpid.numbers[0].nr = atomic_inc_return(&pid_counter); + + memcpy(&LX_THREAD(t), &init_thread, sizeof(struct thread_info)); + + LX_TASK(t) = vmalloc(sizeof(struct task_struct)); + Assert(LX_TASK(t)); + + memcpy(LX_TASK(t), &init_task, sizeof(struct task_struct)); + + /* nice: Linux backreferences a task`s thread_info from the + * task struct (which in turn can be found using the + * thread_info...) */ + LX_TASK(t)->stack = &LX_THREAD(t); + + /* initialize this thread's sleep lock */ + SLEEP_LOCK(t) = ddekit_sem_init(0); + + return t; +} + +/* Process setup for worker threads */ +int l4dde26_process_add_worker(void) +{ + dde26_thread_data *cur = init_dde26_thread(); + + /* If this function is called for a kernel_thread, the thread already has + * been set up and we just need to store a reference to the ddekit struct. + * However, this function may also be called directly to turn an L4 thread + * into a DDE thread. Then, we need to initialize here. 
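+ *
+ * (Illustrative example, not from the original sources: an L4 thread that
+ * was created outside of DDE but wants to call into Linux driver code would
+ * invoke l4dde26_process_add_worker() once before doing so.)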
*/ + cur->_ddekit_thread = ddekit_thread_myself(); + if (cur->_ddekit_thread == NULL) + cur->_ddekit_thread = ddekit_thread_setup_myself(".dde26_thread"); + Assert(cur->_ddekit_thread); + + ddekit_thread_set_my_data(cur); + + attach_pid(LX_TASK(cur), 0, &cur->_vpid); + + /* Linux' default is to have this set to 1 initially and let the + * scheduler set this to 0 later on. + */ + current_thread_info()->preempt_count = 0; + + return 0; +} + + +/** + * Add an already existing DDEKit thread to the set of threads known to the + * Linux environment. This is used for the timer thread, which is actually a + * DDEKit thread, but Linux code shall see it as a Linux thread as well. + */ +int l4dde26_process_from_ddekit(ddekit_thread_t *t) +{ + Assert(t); + + dde26_thread_data *cur = init_dde26_thread(); + cur->_ddekit_thread = t; + ddekit_thread_set_data(t, cur); + attach_pid(LX_TASK(cur), 0, &cur->_vpid); + + return 0; +} + +/** Function to initialize the first DDE process. + */ +int __init l4dde26_process_init(void) +{ + ddekit_lock_init_unlocked(&_pid_task_list_lock); + + int kthreadd_pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); + kthreadd_task = find_task_by_pid(kthreadd_pid); + + l4dde26_process_add_worker(); + + return 0; +} + +DEFINE_PER_CPU(int, cpu_number); + +//dde_process_initcall(l4dde26_process_init); diff --git a/libdde-linux26/lib/src/arch/l4/res.c b/libdde-linux26/lib/src/arch/l4/res.c new file mode 100644 index 00000000..a2ffb98f --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/res.c @@ -0,0 +1,188 @@ +#include "local.h" + +#include <linux/ioport.h> + +/** Request an IO port region. + * + * \param start start port + * \param n number of ports + * \param name name of allocator (unused) + * + * \return NULL error + * \return !=NULL success + * + * \bug Since no one in Linux uses this function's return value, + * we do not allocate and fill a resource struct. + */ +static struct resource *l4dde26_request_region(resource_size_t start, + resource_size_t n, + const char *name) +{ + int err = ddekit_request_io(start, n); + + if (err) + return NULL; + + return (struct resource *)1; +} + + +/** List of memory regions that have been requested. This is used to + * perform ioremap() and iounmap() + */ +static LIST_HEAD(dde_mem_regions); + +/** va->pa mapping used to store memory regions */ +struct dde_mem_region { + ddekit_addr_t pa; + ddekit_addr_t va; + unsigned int size; + struct list_head list; +}; + +void __iomem * ioremap(unsigned long phys_addr, unsigned long size); + +/** Request an IO memory region. + * + * \param start start address + * \param n size of memory area + * \param name name of allocator (unused) + * + * \return NULL error + * \return !=NULL success + * + * \bug Since no one in Linux uses this function's return value, + * we do not allocate and fill a resource struct. 
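+ *
+ * The pa/va pair obtained from ddekit_request_mem() is recorded in the
+ * dde_mem_regions list, so that a later ioremap() on an address inside this
+ * region can return the matching virtual address.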
+ */ +static struct resource *l4dde26_request_mem_region(resource_size_t start, + resource_size_t n, + const char *name) +{ + ddekit_addr_t va = 0; + struct dde_mem_region *mreg; + + // do not a resource request twice + if (ioremap(start, n)) + return (struct resource *)1; + + int i = ddekit_request_mem(start, n, &va); + + if (i) { + ddekit_printf("request_mem_region() failed (start %lx, size %x)", start, n); + return NULL; + } + + mreg = kmalloc(sizeof(struct dde_mem_region), GFP_KERNEL); + Assert(mreg); + + mreg->pa = start; + mreg->va = va; + mreg->size = n; + list_add(&mreg->list, &dde_mem_regions); + +#if 0 + ddekit_pgtab_set_region_with_size((void *)va, start, n, PTE_TYPE_OTHER); +#endif + + return (struct resource *)1; +} + + +struct resource * __request_region(struct resource *parent, + resource_size_t start, + resource_size_t n, + const char *name, int flags) +{ + Assert(parent); + Assert(parent->flags & IORESOURCE_IO || parent->flags & IORESOURCE_MEM); + + switch (parent->flags) + { + case IORESOURCE_IO: + printk("IO: name: %s, start: %x, len: %d\n", + name, start, n); + return l4dde26_request_region(start, n, name); + case IORESOURCE_MEM: + printk("MEM: name: %s, start: %x, len: %d\n", + name, start, n); + return l4dde26_request_mem_region(start, n, name); + } + + return NULL; +} + + +/** Release IO port region. + */ +static void l4dde26_release_region(resource_size_t start, resource_size_t n) +{ + /* FIXME: we need a list of "struct resource"s that have been + * allocated by request_region() and then need to + * free this stuff here! */ + ddekit_release_io(start, n); +} + + +/** Release IO memory region. + */ +static void l4dde26_release_mem_region(resource_size_t start, resource_size_t n) +{ + ddekit_release_mem(start, n); + ddekit_pgtab_clear_region((void *)start, PTE_TYPE_OTHER); +} + + +int __check_region(struct resource *root, resource_size_t s, resource_size_t n) +{ + WARN_UNIMPL; + return -1; +} + +void __release_region(struct resource *root, resource_size_t start, + resource_size_t n) +{ + switch (root->flags) + { + case IORESOURCE_IO: + return l4dde26_release_region(start, n); + case IORESOURCE_MEM: + return l4dde26_release_mem_region(start, n); + } +} + + +/** Map physical I/O region into virtual address space. + * + * For our sake, this only returns the virtual address belonging to + * the physical region, since we don't manage page tables ourselves. + */ +void __iomem * ioremap(unsigned long phys_addr, unsigned long size) +{ + struct list_head *pos, *head; + head = &dde_mem_regions; + + list_for_each(pos, head) { + struct dde_mem_region *mreg = list_entry(pos, struct dde_mem_region, + list); + if (mreg->pa <= phys_addr && mreg->pa + mreg->size >= phys_addr + size) + { + printk ("ioremap: phys: %x <-> virt: %x\n", phys_addr, + (mreg->va + (phys_addr - mreg->pa))); + return (void *)(mreg->va + (phys_addr - mreg->pa)); + } + } + + return NULL; +} + + +void __iomem * ioremap_nocache(unsigned long offset, unsigned long size) +{ + return ioremap(offset, size); +} + + +void iounmap(volatile void __iomem *addr) +{ + WARN_UNIMPL; +} diff --git a/libdde-linux26/lib/src/arch/l4/sched.c b/libdde-linux26/lib/src/arch/l4/sched.c new file mode 100644 index 00000000..b38520c6 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/sched.c @@ -0,0 +1,155 @@ +#include "local.h" + +#include <linux/sched.h> + +DEFINE_RWLOCK(tasklist_lock); + +asmlinkage void preempt_schedule(void) +{ + WARN_UNIMPL; +} + + +/* Our version of scheduler invocation. 
+ * + * Scheduling is performed by Fiasco, so we don't care about it as long as + * a thread is running. If a task becomes TASK_INTERRUPTIBLE or + * TASK_UNINTERRUPTIBLE, we make sure that the task does not become + * scheduled by locking the task's sleep lock. + */ +asmlinkage void schedule(void) +{ + dde26_thread_data *t = lxtask_to_ddethread(current); + + switch (current->state) { + case TASK_RUNNING: + ddekit_thread_schedule(); + break; + case TASK_INTERRUPTIBLE: + case TASK_UNINTERRUPTIBLE: + ddekit_sem_down(SLEEP_LOCK(t)); + break; + default: + panic("current->state = %d --- unknown state\n", current->state); + } +} + + +/** yield the current processor to other threads. + * + * this is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + ddekit_yield(); +} + + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + */ +int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +{ + Assert(p); + dde26_thread_data *t = lxtask_to_ddethread(p); + + Assert(t); + Assert(SLEEP_LOCK(t)); + + p->state = TASK_RUNNING; + ddekit_sem_up(SLEEP_LOCK(t)); + + return 0; +} + + +static void process_timeout(unsigned long data) +{ + wake_up_process((struct task_struct *)data); +} + + +signed long __sched schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire = timeout + jiffies; + + setup_timer(&timer, process_timeout, (unsigned long)current); + timer.expires = expire; + + switch(timeout) + { + /* + * Hah! + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + */ + case MAX_SCHEDULE_TIMEOUT: + schedule(); + break; + default: + add_timer(&timer); + schedule(); + del_timer(&timer); + break; + } + + timeout = expire - jiffies; + + return timeout < 0 ? 0 : timeout; +} + + +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} + + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} + +/** Tasks may be forced to run only on a certain no. of CPUs. Since + * we only emulate a SMP-environment for the sake of having multiple + * threads, we do not need to implement this. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return 0; +} + +void set_user_nice(struct task_struct *p, long nice) +{ + //WARN_UNIMPL; +} + +void __sched io_schedule(void) +{ + WARN_UNIMPL; +} + +long __sched io_schedule_timeout(long timeout) +{ + WARN_UNIMPL; + return -1; +} + +extern int sched_setscheduler_nocheck(struct task_struct *t, int flags, + struct sched_param *p) +{ + WARN_UNIMPL; + return -1; +} + +void ignore_signals(struct task_struct *t) { } diff --git a/libdde-linux26/lib/src/arch/l4/signal.c b/libdde-linux26/lib/src/arch/l4/signal.c new file mode 100644 index 00000000..bd0bc0a7 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/signal.c @@ -0,0 +1,24 @@ +#include "local.h" + +/****************************************************************************** + ** Dummy signal implementation. ** + ** DDE does not provide its own signal implementation. 
To make it compile, ** + ** we provide dummy versions of signalling functions here. If later on ** + ** someone *REALLY* wants to use signals in the DDE context, he might ** + ** erase this file and use something like the L4 signalling library for ** + ** such purposes. ** +*******************************************************************************/ + +int sigprocmask(int how, sigset_t *set, sigset_t *oldset) +{ + return 0; +} + +void flush_signals(struct task_struct *t) +{ +} + +int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) +{ + return 0; +} diff --git a/libdde-linux26/lib/src/arch/l4/smp.c b/libdde-linux26/lib/src/arch/l4/smp.c new file mode 100644 index 00000000..1ebf08c2 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/smp.c @@ -0,0 +1,37 @@ +#include <linux/cpumask.h> + +#include "local.h" + +static struct cpumask _possible = CPU_MASK_ALL; +static struct cpumask _online = CPU_MASK_CPU0; +static struct cpumask _present = CPU_MASK_CPU0; +static struct cpumask _active = CPU_MASK_CPU0; + +const struct cpumask *const cpu_possible_mask = &_possible; +const struct cpumask *const cpu_online_mask = &_online; +const struct cpumask *const cpu_present_mask = &_present; +const struct cpumask *const cpu_active_mask = &_active; + +cpumask_t cpu_mask_all = CPU_MASK_ALL; +int nr_cpu_ids = NR_CPUS; +const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); + +/* cpu_bit_bitmap[0] is empty - so we can back into it */ +#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) +#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) +#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) + +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { + MASK_DECLARE_8(0), MASK_DECLARE_8(8), + MASK_DECLARE_8(16), MASK_DECLARE_8(24), +#if BITS_PER_LONG > 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), +#endif +}; + +void __smp_call_function_single(int cpuid, struct call_single_data *data) +{ + data->func(data->info); +} diff --git a/libdde-linux26/lib/src/arch/l4/softirq.c b/libdde-linux26/lib/src/arch/l4/softirq.c new file mode 100644 index 00000000..d93bfaff --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/softirq.c @@ -0,0 +1,283 @@ +#include "local.h" + +#include <linux/interrupt.h> + +/* There are at most 32 softirqs in Linux, but only 6 are really used. */ +#define NUM_SOFTIRQS 6 + +DECLARE_INITVAR(dde26_softirq); + +/* softirq threads and their wakeup semaphores */ +ddekit_thread_t *dde_softirq_thread; +ddekit_sem_t *dde_softirq_sem; + +/* struct tasklet_head is not defined in a header in Linux 2.6 */ +struct tasklet_head +{ + struct tasklet_struct *list; + ddekit_lock_t lock; /* list lock */ +}; + +/* What to do if a softirq occurs. 
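+ *
+ * softirq_vec[nr] holds the handler registered with open_softirq(nr, fn).
+ * raise_softirq(nr) marks bit nr pending and wakes the softirq thread,
+ * which then runs all pending handlers from __do_softirq().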
*/ +static struct softirq_action softirq_vec[32]; + +/* tasklet queues for each softirq thread */ +struct tasklet_head tasklet_vec; +struct tasklet_head tasklet_hi_vec; + +void open_softirq(int nr, void (*action)(struct softirq_action*)) +{ + softirq_vec[nr].action = action; +} + +static void raise_softirq_irqoff_cpu(unsigned int nr, unsigned int cpu) +{ + CHECK_INITVAR(dde26_softirq); + + /* mark softirq scheduled */ + __raise_softirq_irqoff(nr); + /* wake softirq thread */ + ddekit_sem_up(dde_softirq_sem); +} + +void raise_softirq_irqoff(unsigned int nr) +{ + raise_softirq_irqoff_cpu(nr, 0); +} + +void raise_softirq(unsigned int nr) +{ + unsigned long flags; + + local_irq_save(flags); + raise_softirq_irqoff(nr); + local_irq_restore(flags); +} + +/** + * Initialize tasklet. + */ +void tasklet_init(struct tasklet_struct *t, + void (*func)(unsigned long), unsigned long data) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->func = func; + t->data = data; +} + +void tasklet_kill(struct tasklet_struct *t) +{ + if (in_interrupt()) + printk("Attempt to kill tasklet from interrupt\n"); + + while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { + do + yield(); + while (test_bit(TASKLET_STATE_SCHED, &t->state)); + } + tasklet_unlock_wait(t); + clear_bit(TASKLET_STATE_SCHED, &t->state); +} + +/* enqueue tasklet */ +static void __tasklet_enqueue(struct tasklet_struct *t, + struct tasklet_head *listhead) +{ + ddekit_lock_lock(&listhead->lock); + t->next = listhead->list; + listhead->list = t; + ddekit_lock_unlock(&listhead->lock); +} + +void __tasklet_schedule(struct tasklet_struct *t) +{ + unsigned long flags; + + CHECK_INITVAR(dde26_softirq); + + local_irq_save(flags); + + __tasklet_enqueue(t, &tasklet_vec); + /* raise softirq */ + raise_softirq_irqoff_cpu(TASKLET_SOFTIRQ, 0); + + local_irq_restore(flags); +} + +void __tasklet_hi_schedule(struct tasklet_struct *t) +{ + unsigned long flags; + + CHECK_INITVAR(dde26_softirq); + + local_irq_save(flags); + __tasklet_enqueue(t, &tasklet_hi_vec); + raise_softirq_irqoff_cpu(HI_SOFTIRQ, 0); + local_irq_restore(flags); +} + +/* Execute tasklets */ +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + ddekit_lock_lock(&tasklet_vec.lock); + list = tasklet_vec.list; + tasklet_vec.list = NULL; + ddekit_lock_unlock(&tasklet_vec.lock); + + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + t->func(t->data); + tasklet_unlock(t); + continue; + } + tasklet_unlock(t); + } + + ddekit_lock_lock(&tasklet_vec.lock); + t->next = tasklet_vec.list; + tasklet_vec.list = t; + raise_softirq_irqoff_cpu(TASKLET_SOFTIRQ, 0); + ddekit_lock_unlock(&tasklet_vec.lock); + } +} + + +static void tasklet_hi_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + ddekit_lock_lock(&tasklet_hi_vec.lock); + list = tasklet_hi_vec.list; + tasklet_hi_vec.list = NULL; + ddekit_lock_unlock(&tasklet_hi_vec.lock); + + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + t->func(t->data); + tasklet_unlock(t); + continue; + } + tasklet_unlock(t); + } + + ddekit_lock_lock(&tasklet_hi_vec.lock); + t->next = tasklet_hi_vec.list; + tasklet_hi_vec.list = t; + raise_softirq_irqoff_cpu(HI_SOFTIRQ, 0); + 
ddekit_lock_unlock(&tasklet_hi_vec.lock); + } +} + + +#define MAX_SOFTIRQ_RETRIES 10 + +/** Run softirq handlers + */ +asmlinkage +void __do_softirq(void) +{ + int retries = MAX_SOFTIRQ_RETRIES; + do { + struct softirq_action *h = softirq_vec; + unsigned long pending = local_softirq_pending(); + + /* reset softirq count */ + set_softirq_pending(0); + + /* While we have a softirq pending... */ + while (pending) { + /* need to execute current softirq? */ + if (pending & 1) + h->action(h); + /* try next softirq */ + h++; + /* remove pending flag for last softirq */ + pending >>= 1; + } + + /* Somebody might have scheduled another softirq in between + * (e.g., an IRQ thread or another tasklet). */ + } while (local_softirq_pending() && --retries); + +} + + +asmlinkage +void do_softirq(void) +{ + unsigned long flags; + + local_irq_save(flags); + if (local_softirq_pending()) + __do_softirq(); + local_irq_restore(flags); +} + +/** Softirq thread function. + * + * Once started, a softirq thread waits for tasklets to be scheduled + * and executes them. + * + * \param arg # of this softirq thread so that it grabs the correct lock + * if multiple softirq threads are running. + */ +void l4dde26_softirq_thread(void *arg) +{ + printk("Softirq daemon starting\n"); + l4dde26_process_add_worker(); + + /* This thread will always be in a softirq, so set the + * corresponding flag right now. + */ + preempt_count() |= SOFTIRQ_MASK; + + while(1) { + ddekit_sem_down(dde_softirq_sem); + do_softirq(); + } +} + +/** Initialize softirq subsystem. + * + * Start NUM_SOFTIRQ_THREADS threads executing the \ref l4dde26_softirq_thread + * function. + */ +void l4dde26_softirq_init(void) +{ + char name[20]; + + dde_softirq_sem = ddekit_sem_init(0); + + set_softirq_pending(0); + + ddekit_lock_init_unlocked(&tasklet_vec.lock); + ddekit_lock_init_unlocked(&tasklet_hi_vec.lock); + + snprintf(name, 20, ".softirqd"); + dde_softirq_thread = ddekit_thread_create( + l4dde26_softirq_thread, + NULL, name); + + open_softirq(TASKLET_SOFTIRQ, tasklet_action); + open_softirq(HI_SOFTIRQ, tasklet_hi_action); + + INITIALIZE_INITVAR(dde26_softirq); +} diff --git a/libdde-linux26/lib/src/arch/l4/timer.c b/libdde-linux26/lib/src/arch/l4/timer.c new file mode 100644 index 00000000..b85c83a9 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/timer.c @@ -0,0 +1,163 @@ +#include "local.h" + +#include <linux/timer.h> +#include <linux/fs.h> +#include <asm/delay.h> + +DECLARE_INITVAR(dde26_timer); + +/* Definitions from linux/kernel/timer.c */ + +/* + * per-CPU timer vector definitions: + */ +#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +#define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +typedef struct tvec_s { + struct list_head vec[TVN_SIZE]; +} tvec_t; + +typedef struct tvec_root_s { + struct list_head vec[TVR_SIZE]; +} tvec_root_t; + +struct tvec_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long timer_jiffies; + tvec_root_t tv1; + tvec_t tv2; + tvec_t tv3; + tvec_t tv4; + tvec_t tv5; +} ____cacheline_aligned_in_smp; + +typedef struct tvec_t_base_s tvec_base_t; + +struct tvec_base boot_tvec_bases __attribute__((unused)); + +static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) __attribute__((unused)) = &boot_tvec_bases; + +void init_timer(struct timer_list *timer) +{ + timer->ddekit_timer_id = DDEKIT_INVALID_TIMER_ID; +} + +void add_timer(struct timer_list *timer) +{ + CHECK_INITVAR(dde26_timer); + /* DDE2.6 uses jiffies and HZ as exported from L4IO. Therefore + * we just need to hand over the timeout to DDEKit. */ + timer->ddekit_timer_id = ddekit_add_timer((void *)timer->function, + (void *)timer->data, + timer->expires); +} + + +void add_timer_on(struct timer_list *timer, int cpu) +{ + add_timer(timer); +} + + +int del_timer(struct timer_list * timer) +{ + int ret; + CHECK_INITVAR(dde26_timer); + ret = ddekit_del_timer(timer->ddekit_timer_id); + timer->ddekit_timer_id = DDEKIT_INVALID_TIMER_ID; + + return ret >= 0; +} + +int del_timer_sync(struct timer_list *timer) +{ + return del_timer(timer); +} + + +int __mod_timer(struct timer_list *timer, unsigned long expires) +{ + /* XXX: Naive implementation. If we really need to be fast with + * this function, we can implement a faster version inside + * the DDEKit. Bjoern just does not think that this is the + * case. + */ + int r; + + CHECK_INITVAR(dde26_timer); + r = del_timer(timer); + + timer->expires = expires; + add_timer(timer); + + return (r > 0); +} + + +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires); +} + + +int timer_pending(const struct timer_list *timer) +{ + CHECK_INITVAR(dde26_timer); + /* There must be a valid DDEKit timer ID in the timer field + * *AND* it must be pending in the DDEKit. 
+ */ + return ((timer->ddekit_timer_id != DDEKIT_INVALID_TIMER_ID) + && ddekit_timer_pending(timer->ddekit_timer_id)); +} + + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + ddekit_thread_msleep(msecs); +} + +void __init l4dde26_init_timers(void) +{ + ddekit_init_timers(); + + l4dde26_process_from_ddekit(ddekit_get_timer_thread()); + + INITIALIZE_INITVAR(dde26_timer); +} + +core_initcall(l4dde26_init_timers); + +__attribute__((weak)) void do_gettimeofday (struct timeval *tv) +{ + WARN_UNIMPL; +} + +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = {0,0}; + WARN_UNIMPL; + return now; +} + +ktime_t ktime_get_real(void) +{ + struct timespec now = {0,0}; + WARN_UNIMPL; + return timespec_to_ktime(now); +} + + +void native_io_delay(void) +{ + udelay(2); +} diff --git a/libdde-linux26/lib/src/arch/l4/vmalloc.c b/libdde-linux26/lib/src/arch/l4/vmalloc.c new file mode 100644 index 00000000..4fa063f0 --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/vmalloc.c @@ -0,0 +1,28 @@ +/****************************************************************************** + * Bjoern Doebel <doebel@tudos.org> * + * * + * (c) 2005 - 2007 Technische Universitaet Dresden * + * This file is part of DROPS, which is distributed under the terms of the * + * GNU General Public License 2. Please see the COPYING file for details. * + ******************************************************************************/ + +/* + * \brief vmalloc implementation + * \author Bjoern Doebel + * \date 2007-07-30 + */ + +/* Linux */ +#include <linux/vmalloc.h> + +#include "local.h" + +void *vmalloc(unsigned long size) +{ + return ddekit_simple_malloc(size); +} + +void vfree(const void *addr) +{ + ddekit_simple_free((void*)addr); +} diff --git a/libdde-linux26/lib/src/arch/l4/vmstat.c b/libdde-linux26/lib/src/arch/l4/vmstat.c new file mode 100644 index 00000000..2e87389e --- /dev/null +++ b/libdde-linux26/lib/src/arch/l4/vmstat.c @@ -0,0 +1,34 @@ +#include "local.h" + +#include <linux/fs.h> + +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; + + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + WARN_UNIMPL; +} + + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + WARN_UNIMPL; +} + + +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + WARN_UNIMPL; +} + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + WARN_UNIMPL; +} + +void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +{ + WARN_UNIMPL; +} diff --git a/libdde-linux26/lib/src/arch/x86/kernel/setup.c b/libdde-linux26/lib/src/arch/x86/kernel/setup.c new file mode 100644 index 00000000..ef285c0d --- /dev/null +++ b/libdde-linux26/lib/src/arch/x86/kernel/setup.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons <orc@pell.chi.il.us>, July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle <bmoyle@mvista.com>, February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel <mochel@osdl.org>, March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. 
+ * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/screen_info.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/apm_bios.h> +#include <linux/initrd.h> +#include <linux/bootmem.h> +#include <linux/seq_file.h> +#include <linux/console.h> +#include <linux/mca.h> +#include <linux/root_dev.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <linux/iscsi_ibft.h> +#include <linux/nodemask.h> +#include <linux/kexec.h> +#include <linux/dmi.h> +#include <linux/pfn.h> +#include <linux/pci.h> +#include <asm/pci-direct.h> +#include <linux/init_ohci1394_dma.h> +#include <linux/kvm_para.h> + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/delay.h> + +#include <linux/kallsyms.h> +#include <linux/cpufreq.h> +#include <linux/dma-mapping.h> +#include <linux/ctype.h> +#include <linux/uaccess.h> + +#include <linux/percpu.h> +#include <linux/crash_dump.h> + +#include <video/edid.h> + +#include <asm/mtrr.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/arch_hooks.h> +#include <asm/efi.h> +#include <asm/sections.h> +#include <asm/dmi.h> +#include <asm/io_apic.h> +#include <asm/ist.h> +#include <asm/vmi.h> +#include <setup_arch.h> +#include <asm/bios_ebda.h> +#include <asm/cacheflush.h> +#include <asm/processor.h> +#include <asm/bugs.h> + +#include <asm/system.h> +#include <asm/vsyscall.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/dma.h> +#include <asm/iommu.h> +#include <asm/gart.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> + +#include <mach_apic.h> +#include <asm/paravirt.h> +#include <asm/hypervisor.h> + +#include <asm/percpu.h> +#include <asm/topology.h> +#include <asm/apicdef.h> +#ifdef CONFIG_X86_64 +#include <asm/numa_64.h> +#endif + +/* common cpu data for all cpus */ +/* XXX: Asserting >= 586 */ +struct cpuinfo_x86 boot_cpu_data __read_mostly = {5, 0, 0, 0, -1, 1, 0, 0, -1}; +EXPORT_SYMBOL(boot_cpu_data); diff --git a/libdde-linux26/lib/src/arch/x86/lib/delay.c b/libdde-linux26/lib/src/arch/x86/lib/delay.c new file mode 100644 index 00000000..6097c6bb --- /dev/null +++ b/libdde-linux26/lib/src/arch/x86/lib/delay.c @@ -0,0 +1,138 @@ +/* + * Precise Delay Loops for i386 + * + * Copyright (C) 1993 Linus Torvalds + * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com> + * + * The __delay function must _NOT_ be inlined as its execution time + * depends wildly on alignment on many x86 processors. The additional + * jump magic is needed to get the timing stable on all the CPU's + * we have to worry about. 
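+ *
+ * (Added note: the __udelay()/__ndelay() wrappers below work in 2^32 fixed
+ * point: __udelay(us) passes us * 0x10c7 (roughly 2^32/10^6) to
+ * __const_udelay(), which multiplies by loops_per_jiffy * HZ and keeps only
+ * the upper 32 bits, i.e. the number of delay-loop iterations that
+ * correspond to us microseconds.)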
+ */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/timex.h> +#include <linux/preempt.h> +#include <linux/delay.h> +#include <linux/init.h> + +#include <asm/processor.h> +#include <asm/delay.h> +#include <asm/timer.h> + +#ifdef CONFIG_SMP +# include <asm/smp.h> +#endif +#include <ddekit/timer.h> + +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ + asm volatile( + " test %0,%0 \n" + " jz 3f \n" + " jmp 1f \n" + + ".align 16 \n" + "1: jmp 2f \n" + + ".align 16 \n" + "2: dec %0 \n" + " jnz 2b \n" + "3: dec %0 \n" + + : /* we don't need output */ + :"a" (loops) + ); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + int cpu; + + preempt_disable(); + cpu = smp_processor_id(); + rdtscl(bclock); + for (;;) { + rdtscl(now); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtscl(bclock); + } + } + preempt_enable(); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ + delay_fn = delay_tsc; +} + +int __devinit read_current_timer(unsigned long *timer_val) +{ + if (delay_fn == delay_tsc) { + rdtscll(*timer_val); + return 0; + } + return -1; +} + +void __delay(unsigned long loops) +{ + delay_fn(loops); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + int d0; + + xloops *= 4; + asm("mull %%edx" + :"=d" (xloops), "=&a" (d0) + :"1" (xloops), "0" + (loops_per_jiffy * (HZ/4))); + + __delay(++xloops); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/libdde-linux26/lib/src/arch/x86/lib/semaphore_32.S b/libdde-linux26/lib/src/arch/x86/lib/semaphore_32.S new file mode 100644 index 00000000..1850ca50 --- /dev/null +++ b/libdde-linux26/lib/src/arch/x86/lib/semaphore_32.S @@ -0,0 +1,138 @@ +/* + * i386 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> + */ + +#include <linux/linkage.h> +#include <asm/rwlock.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +/* + * The semaphore operations have a special calling sequence that + * allow us to do a simpler in-line version of them. 
These routines + * need to convert that sequence back into the C sequence when + * there is contention on the semaphore. + * + * %eax contains the semaphore pointer on entry. Save the C-clobbered + * registers (%eax, %edx and %ecx) except %eax whish is either a return + * value or just clobbered.. + */ +#ifndef DDE_LINUX + .section .sched.text, "ax" +#endif + +/* + * rw spinlock fallbacks + */ +#ifdef CONFIG_SMP +ENTRY(__write_lock_failed) + CFI_STARTPROC simple + FRAME +2: LOCK_PREFIX + addl $ RW_LOCK_BIAS,(%eax) +1: rep; nop + cmpl $ RW_LOCK_BIAS,(%eax) + jne 1b + LOCK_PREFIX + subl $ RW_LOCK_BIAS,(%eax) + jnz 2b + ENDFRAME + ret + CFI_ENDPROC + ENDPROC(__write_lock_failed) + +ENTRY(__read_lock_failed) + CFI_STARTPROC + FRAME +2: LOCK_PREFIX + incl (%eax) +1: rep; nop + cmpl $1,(%eax) + js 1b + LOCK_PREFIX + decl (%eax) + js 2b + ENDFRAME + ret + CFI_ENDPROC + ENDPROC(__read_lock_failed) + +#endif + +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + CFI_STARTPROC + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + push %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + call rwsem_down_read_failed + pop %edx + CFI_ADJUST_CFA_OFFSET -4 + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + CFI_STARTPROC + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + calll rwsem_down_write_failed + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + CFI_STARTPROC + decw %dx /* do nothing if still outstanding active readers */ + jnz 1f + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + call rwsem_wake + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 +1: ret + CFI_ENDPROC + ENDPROC(call_rwsem_wake) + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_downgrade_wake) + CFI_STARTPROC + push %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx,0 + push %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx,0 + call rwsem_downgrade_wake + pop %edx + CFI_ADJUST_CFA_OFFSET -4 + pop %ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + ENDPROC(call_rwsem_downgrade_wake) + +#endif diff --git a/libdde-linux26/lib/src/block/blk-core.c b/libdde-linux26/lib/src/block/blk-core.c new file mode 100644 index 00000000..3104b543 --- /dev/null +++ b/libdde-linux26/lib/src/block/blk-core.c @@ -0,0 +1,2175 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1994, Karl Keyte: Added support for disk statistics + * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> + * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> + * - July2000 + * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 + */ + +/* + * This handles all read/write requests to block devices + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/blktrace_api.h> +#include <linux/fault-inject.h> +#include <trace/block.h> + +#include "blk.h" + +#include <ddekit/timer.h> + 
+DEFINE_TRACE(block_plug); +DEFINE_TRACE(block_unplug_io); +DEFINE_TRACE(block_unplug_timer); +DEFINE_TRACE(block_getrq); +DEFINE_TRACE(block_sleeprq); +DEFINE_TRACE(block_rq_requeue); +DEFINE_TRACE(block_bio_backmerge); +DEFINE_TRACE(block_bio_frontmerge); +DEFINE_TRACE(block_bio_queue); +DEFINE_TRACE(block_rq_complete); +DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ +EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); + +static int __make_request(struct request_queue *q, struct bio *bio); + +/* + * For the allocated request tables + */ +static struct kmem_cache *request_cachep; + +/* + * For queue allocation + */ +struct kmem_cache *blk_requestq_cachep; + +/* + * Controlling structure to kblockd + */ +static struct workqueue_struct *kblockd_workqueue; + +static void drive_stat_acct(struct request *rq, int new_io) +{ + struct gendisk *disk = rq->rq_disk; + struct hd_struct *part; + int rw = rq_data_dir(rq); + int cpu; + + if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue)) + return; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(rq->rq_disk, rq->sector); + + if (!new_io) + part_stat_inc(cpu, part, merges[rw]); + else { + part_round_stats(cpu, part); + part_inc_in_flight(part); + } + + part_stat_unlock(); +} + +void blk_queue_congestion_threshold(struct request_queue *q) +{ + int nr; + + nr = q->nr_requests - (q->nr_requests / 8) + 1; + if (nr > q->nr_requests) + nr = q->nr_requests; + q->nr_congestion_on = nr; + + nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; + if (nr < 1) + nr = 1; + q->nr_congestion_off = nr; +} + +/** + * blk_get_backing_dev_info - get the address of a queue's backing_dev_info + * @bdev: device + * + * Locates the passed device's request queue and returns the address of its + * backing_dev_info + * + * Will return NULL if the request queue cannot be located. 
+ */ +struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) +{ + struct backing_dev_info *ret = NULL; + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + ret = &q->backing_dev_info; + return ret; +} +EXPORT_SYMBOL(blk_get_backing_dev_info); + +void blk_rq_init(struct request_queue *q, struct request *rq) +{ + memset(rq, 0, sizeof(*rq)); + + INIT_LIST_HEAD(&rq->queuelist); + INIT_LIST_HEAD(&rq->timeout_list); + rq->cpu = -1; + rq->q = q; + rq->sector = rq->hard_sector = (sector_t) -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->cmd = rq->__cmd; + rq->tag = -1; + rq->ref_count = 1; +} +EXPORT_SYMBOL(blk_rq_init); + +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, int error) +{ + struct request_queue *q = rq->q; + + if (&q->bar_rq != rq) { + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (unlikely(nbytes > bio->bi_size)) { + printk(KERN_ERR "%s: want %u bytes done, %u left\n", + __func__, nbytes, bio->bi_size); + nbytes = bio->bi_size; + } + + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); + + bio->bi_size -= nbytes; + bio->bi_sector += (nbytes >> 9); + + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); + + if (bio->bi_size == 0) + bio_endio(bio, error); + } else { + + /* + * Okay, this is the barrier request in progress, just + * record the error; + */ + if (error && !q->orderr) + q->orderr = error; + } +} + +void blk_dump_rq_flags(struct request *rq, char *msg) +{ + int bit; + + printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, + rq->cmd_flags); + + printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", + (unsigned long long)rq->sector, + rq->nr_sectors, + rq->current_nr_sectors); + printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", + rq->bio, rq->biotail, + rq->buffer, rq->data, + rq->data_len); + + if (blk_pc_request(rq)) { + printk(KERN_INFO " cdb: "); + for (bit = 0; bit < BLK_MAX_CDB; bit++) + printk("%02x ", rq->cmd[bit]); + printk("\n"); + } +} +EXPORT_SYMBOL(blk_dump_rq_flags); + +/* + * "plug" the device if there are no outstanding requests: this will + * force the transfer to start only after we have put all the requests + * on the list. + * + * This is called with interrupts off and no requests on the queue and + * with the queue lock held. + */ +void blk_plug_device(struct request_queue *q) +{ + WARN_ON(!irqs_disabled()); + + /* + * don't plug a stopped queue, it must be paired with blk_start_queue() + * which will restart the queueing + */ + if (blk_queue_stopped(q)) + return; + + if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { + mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); + trace_block_plug(q); + } +} +EXPORT_SYMBOL(blk_plug_device); + +/** + * blk_plug_device_unlocked - plug a device without queue lock held + * @q: The &struct request_queue to plug + * + * Description: + * Like @blk_plug_device(), but grabs the queue lock and disables + * interrupts. + **/ +void blk_plug_device_unlocked(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + blk_plug_device(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL(blk_plug_device_unlocked); + +/* + * remove the queue from the plugged list, if present. called with + * queue lock held and interrupts disabled. 
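+ *
+ * Returns 1 if the queue was plugged (the pending unplug timer is deleted
+ * as well), 0 if it was not plugged in the first place.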
+ */ +int blk_remove_plug(struct request_queue *q) +{ + WARN_ON(!irqs_disabled()); + + if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) + return 0; + + del_timer(&q->unplug_timer); + return 1; +} +EXPORT_SYMBOL(blk_remove_plug); + +/* + * remove the plug and let it rip.. + */ +void __generic_unplug_device(struct request_queue *q) +{ + if (unlikely(blk_queue_stopped(q))) + return; + if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) + return; + + q->request_fn(q); +} + +/** + * generic_unplug_device - fire a request queue + * @q: The &struct request_queue in question + * + * Description: + * Linux uses plugging to build bigger requests queues before letting + * the device have at them. If a queue is plugged, the I/O scheduler + * is still adding and merging requests on the queue. Once the queue + * gets unplugged, the request_fn defined for the queue is invoked and + * transfers started. + **/ +void generic_unplug_device(struct request_queue *q) +{ + if (blk_queue_plugged(q)) { + spin_lock_irq(q->queue_lock); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + } +} +EXPORT_SYMBOL(generic_unplug_device); + +static void blk_backing_dev_unplug(struct backing_dev_info *bdi, + struct page *page) +{ + struct request_queue *q = bdi->unplug_io_data; + + blk_unplug(q); +} + +void blk_unplug_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, unplug_work); + + trace_block_unplug_io(q); + q->unplug_fn(q); +} + +void blk_unplug_timeout(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + trace_block_unplug_timer(q); + kblockd_schedule_work(q, &q->unplug_work); +} + +void blk_unplug(struct request_queue *q) +{ + /* + * devices don't necessarily have an ->unplug_fn defined + */ + if (q->unplug_fn) { + trace_block_unplug_io(q); + q->unplug_fn(q); + } +} +EXPORT_SYMBOL(blk_unplug); + +static void blk_invoke_request_fn(struct request_queue *q) +{ + if (unlikely(blk_queue_stopped(q))) + return; + + /* + * one level of recursion is ok and is much faster than kicking + * the unplug handling + */ + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { + q->request_fn(q); + queue_flag_clear(QUEUE_FLAG_REENTER, q); + } else { + queue_flag_set(QUEUE_FLAG_PLUGGED, q); + kblockd_schedule_work(q, &q->unplug_work); + } +} + +/** + * blk_start_queue - restart a previously stopped queue + * @q: The &struct request_queue in question + * + * Description: + * blk_start_queue() will clear the stop flag on the queue, and call + * the request_fn for the queue if it was in a stopped state when + * entered. Also see blk_stop_queue(). Queue lock must be held. + **/ +void blk_start_queue(struct request_queue *q) +{ + WARN_ON(!irqs_disabled()); + + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_invoke_request_fn(q); +} +EXPORT_SYMBOL(blk_start_queue); + +/** + * blk_stop_queue - stop a queue + * @q: The &struct request_queue in question + * + * Description: + * The Linux block layer assumes that a block driver will consume all + * entries on the request queue when the request_fn strategy is called. + * Often this will not happen, because of hardware limitations (queue + * depth settings). If a device driver gets a 'queue full' response, + * or if it simply chooses not to queue more I/O at one point, it can + * call this function to prevent the request_fn from being called until + * the driver has signalled it's ready to go again. This happens by calling + * blk_start_queue() to restart queue operations. Queue lock must be held. 
+ **/ +void blk_stop_queue(struct request_queue *q) +{ + blk_remove_plug(q); + queue_flag_set(QUEUE_FLAG_STOPPED, q); +} +EXPORT_SYMBOL(blk_stop_queue); + +/** + * blk_sync_queue - cancel any pending callbacks on a queue + * @q: the queue + * + * Description: + * The block layer may perform asynchronous callback activity + * on a queue, such as calling the unplug function after a timeout. + * A block device may call blk_sync_queue to ensure that any + * such activity is cancelled, thus allowing it to release resources + * that the callbacks might use. The caller must already have made sure + * that its ->make_request_fn will not re-add plugging prior to calling + * this function. + * + */ +void blk_sync_queue(struct request_queue *q) +{ + del_timer_sync(&q->unplug_timer); + del_timer_sync(&q->timeout); + cancel_work_sync(&q->unplug_work); +} +EXPORT_SYMBOL(blk_sync_queue); + +/** + * __blk_run_queue - run a single device queue + * @q: The queue to run + * + * Description: + * See @blk_run_queue. This variant must be called with the queue lock + * held and interrupts disabled. + * + */ +void __blk_run_queue(struct request_queue *q) +{ + blk_remove_plug(q); + + /* + * Only recurse once to avoid overrunning the stack, let the unplug + * handling reinvoke the handler shortly if we already got there. + */ + if (!elv_queue_empty(q)) + blk_invoke_request_fn(q); +} +EXPORT_SYMBOL(__blk_run_queue); + +/** + * blk_run_queue - run a single device queue + * @q: The queue to run + * + * Description: + * Invoke request handling on this queue, if it has pending work to do. + * May be used to restart queueing when a request has completed. Also + * See @blk_start_queueing. + * + */ +void blk_run_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_run_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL(blk_run_queue); + +void blk_put_queue(struct request_queue *q) +{ + kobject_put(&q->kobj); +} + +void blk_cleanup_queue(struct request_queue *q) +{ + /* + * We know we have process context here, so we can be a little + * cautious and ensure that pending block actions on this device + * are done before moving on. Going into this function, we should + * not have processes doing IO to this device. 
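+ *
+ * The teardown sequence below is: flush pending timers and work with
+ * blk_sync_queue(), mark the queue QUEUE_FLAG_DEAD under sysfs_lock, exit
+ * the elevator if one is attached, and finally drop the queue reference
+ * with blk_put_queue().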
+ */ + blk_sync_queue(q); + + mutex_lock(&q->sysfs_lock); + queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); + mutex_unlock(&q->sysfs_lock); + + if (q->elevator) + elevator_exit(q->elevator); + + blk_put_queue(q); +} +EXPORT_SYMBOL(blk_cleanup_queue); + +static int blk_init_free_list(struct request_queue *q) +{ + struct request_list *rl = &q->rq; + + rl->count[READ] = rl->count[WRITE] = 0; + rl->starved[READ] = rl->starved[WRITE] = 0; + rl->elvpriv = 0; + init_waitqueue_head(&rl->wait[READ]); + init_waitqueue_head(&rl->wait[WRITE]); + + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, + mempool_free_slab, request_cachep, q->node); + + if (!rl->rq_pool) + return -ENOMEM; + + return 0; +} + +struct request_queue *blk_alloc_queue(gfp_t gfp_mask) +{ + return blk_alloc_queue_node(gfp_mask, -1); +} +EXPORT_SYMBOL(blk_alloc_queue); + +struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) +{ + struct request_queue *q; + int err; + + q = kmem_cache_alloc_node(blk_requestq_cachep, + gfp_mask | __GFP_ZERO, node_id); + if (!q) + return NULL; + + q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; + q->backing_dev_info.unplug_io_data = q; + err = bdi_init(&q->backing_dev_info); + if (err) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + + init_timer(&q->unplug_timer); + setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_LIST_HEAD(&q->timeout_list); + INIT_WORK(&q->unplug_work, blk_unplug_work); + + kobject_init(&q->kobj, &blk_queue_ktype); + + mutex_init(&q->sysfs_lock); + spin_lock_init(&q->__queue_lock); + + return q; +} +EXPORT_SYMBOL(blk_alloc_queue_node); + +/** + * blk_init_queue - prepare a request queue for use with a block device + * @rfn: The function to be called to process requests that have been + * placed on the queue. + * @lock: Request queue spin lock + * + * Description: + * If a block device wishes to use the standard request handling procedures, + * which sorts requests and coalesces adjacent requests, then it must + * call blk_init_queue(). The function @rfn will be called when there + * are requests on the queue that need to be processed. If the device + * supports plugging, then @rfn may not be called immediately when requests + * are available on the queue, but may be called at some time later instead. + * Plugged queues are generally unplugged when a buffer belonging to one + * of the requests on the queue is needed, or due to memory pressure. + * + * @rfn is not required, or even expected, to remove all requests off the + * queue, but only as many as it can handle at a time. If it does leave + * requests on the queue, it is responsible for arranging that the requests + * get dealt with eventually. + * + * The queue spin lock must be held while manipulating the requests on the + * request queue; this lock will be taken also from interrupt context, so irq + * disabling is needed for it. + * + * Function returns a pointer to the initialized request queue, or %NULL if + * it didn't succeed. + * + * Note: + * blk_init_queue() must be paired with a blk_cleanup_queue() call + * when the block device is deactivated (such as at module unload). 
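+ *
+ * A minimal usage sketch (my_request_fn, my_lock and my_disk are placeholder
+ * names, not defined in this file):
+ *
+ *     q = blk_init_queue(my_request_fn, &my_lock);
+ *     if (!q)
+ *             return -ENOMEM;
+ *     my_disk->queue = q;
+ *
+ * with a matching blk_cleanup_queue(q) once the device goes away.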
+ **/ + +struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) +{ + return blk_init_queue_node(rfn, lock, -1); +} +EXPORT_SYMBOL(blk_init_queue); + +struct request_queue * +blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) +{ + struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); + + if (!q) + return NULL; + + q->node = node_id; + if (blk_init_free_list(q)) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + + /* + * if caller didn't supply a lock, they get per-queue locking with + * our embedded lock + */ + if (!lock) + lock = &q->__queue_lock; + + q->request_fn = rfn; + q->prep_rq_fn = NULL; + q->unplug_fn = generic_unplug_device; + q->queue_flags = QUEUE_FLAG_DEFAULT; + q->queue_lock = lock; + + blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); + + blk_queue_make_request(q, __make_request); + blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); + + blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); + blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); + + q->sg_reserved_size = INT_MAX; + + blk_set_cmd_filter_defaults(&q->cmd_filter); + + /* + * all done + */ + if (!elevator_init(q, NULL)) { + blk_queue_congestion_threshold(q); + return q; + } + + blk_put_queue(q); + return NULL; +} +EXPORT_SYMBOL(blk_init_queue_node); + +int blk_get_queue(struct request_queue *q) +{ + if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + kobject_get(&q->kobj); + return 0; + } + + return 1; +} + +static inline void blk_free_request(struct request_queue *q, struct request *rq) +{ + if (rq->cmd_flags & REQ_ELVPRIV) + elv_put_request(q, rq); + mempool_free(rq, q->rq.rq_pool); +} + +static struct request * +blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) +{ + struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + + if (!rq) + return NULL; + + blk_rq_init(q, rq); + + rq->cmd_flags = rw | REQ_ALLOCED; + + if (priv) { + if (unlikely(elv_set_request(q, rq, gfp_mask))) { + mempool_free(rq, q->rq.rq_pool); + return NULL; + } + rq->cmd_flags |= REQ_ELVPRIV; + } + + return rq; +} + +/* + * ioc_batching returns true if the ioc is a valid batching request and + * should be given priority access to a request. + */ +static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) +{ + if (!ioc) + return 0; + + /* + * Make sure the process is able to allocate at least 1 request + * even if the batch times out, otherwise we could theoretically + * lose wakeups. + */ + return ioc->nr_batch_requests == q->nr_batching || + (ioc->nr_batch_requests > 0 + && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); +} + +/* + * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This + * will cause the process to be a "batcher" on all queues in the system. This + * is the behaviour we want though - once it gets a wakeup it should be given + * a nice run. + */ +static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) +{ + if (!ioc || ioc_batching(q, ioc)) + return; + + ioc->nr_batch_requests = q->nr_batching; + ioc->last_waited = jiffies; +} + +static void __freed_request(struct request_queue *q, int rw) +{ + struct request_list *rl = &q->rq; + + if (rl->count[rw] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, rw); + + if (rl->count[rw] + 1 <= q->nr_requests) { + if (waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); + + blk_clear_queue_full(q, rw); + } +} + +/* + * A request has just been released. 
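blk_init_queue() above is what a conventional request-based driver calls at probe time; the request function it registers then drains the queue with elv_next_request() and completes work segment by segment. A minimal sketch in the style of the classic example drivers, with all mydrv_* names hypothetical:

    /* Illustrative only: a simple request function plus queue setup. */
    static DEFINE_SPINLOCK(mydrv_lock);
    static struct request_queue *mydrv_queue;

    static void mydrv_request_fn(struct request_queue *q)
    {
        struct request *req;

        while ((req = elv_next_request(q)) != NULL) {
            if (!blk_fs_request(req)) {
                end_request(req, 0);    /* fail non-filesystem requests */
                continue;
            }
            /* mydrv_transfer() is hypothetical: move req->buffer to/from media */
            mydrv_transfer(req->sector, req->current_nr_sectors,
                           req->buffer, rq_data_dir(req));
            end_request(req, 1);        /* current segment done */
        }
    }

    static int mydrv_setup_queue(void)
    {
        mydrv_queue = blk_init_queue(mydrv_request_fn, &mydrv_lock);
        return mydrv_queue ? 0 : -ENOMEM;
    }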
Account for it, update the full and + * congestion status, wake up any waiters. Called under q->queue_lock. + */ +static void freed_request(struct request_queue *q, int rw, int priv) +{ + struct request_list *rl = &q->rq; + + rl->count[rw]--; + if (priv) + rl->elvpriv--; + + __freed_request(q, rw); + + if (unlikely(rl->starved[rw ^ 1])) + __freed_request(q, rw ^ 1); +} + +#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) +/* + * Get a free request, queue_lock must be held. + * Returns NULL on failure, with queue_lock held. + * Returns !NULL on success, with queue_lock *not held*. + */ +static struct request *get_request(struct request_queue *q, int rw_flags, + struct bio *bio, gfp_t gfp_mask) +{ + struct request *rq = NULL; + struct request_list *rl = &q->rq; + struct io_context *ioc = NULL; + const int rw = rw_flags & 0x01; + int may_queue, priv; + + may_queue = elv_may_queue(q, rw_flags); + if (may_queue == ELV_MQUEUE_NO) + goto rq_starved; + + if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { + if (rl->count[rw]+1 >= q->nr_requests) { + ioc = current_io_context(GFP_ATOMIC, q->node); + /* + * The queue will fill after this allocation, so set + * it as full, and mark this process as "batching". + * This process will be allowed to complete a batch of + * requests, others will be blocked. + */ + if (!blk_queue_full(q, rw)) { + ioc_set_batching(q, ioc); + blk_set_queue_full(q, rw); + } else { + if (may_queue != ELV_MQUEUE_MUST + && !ioc_batching(q, ioc)) { + /* + * The queue is full and the allocating + * process is not a "batcher", and not + * exempted by the IO scheduler + */ + goto out; + } + } + } + blk_set_queue_congested(q, rw); + } + + /* + * Only allow batching queuers to allocate up to 50% over the defined + * limit of requests, otherwise we could have thousands of requests + * allocated with any setting of ->nr_requests + */ + if (rl->count[rw] >= (3 * q->nr_requests / 2)) + goto out; + + rl->count[rw]++; + rl->starved[rw] = 0; + + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); + if (priv) + rl->elvpriv++; + + spin_unlock_irq(q->queue_lock); + + rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); + if (unlikely(!rq)) { + /* + * Allocation failed presumably due to memory. Undo anything + * we might have messed up. + * + * Allocating task should really be put onto the front of the + * wait queue, but this is pretty rare. + */ + spin_lock_irq(q->queue_lock); + freed_request(q, rw, priv); + + /* + * in the very unlikely event that allocation failed and no + * requests for this direction was pending, mark us starved + * so that freeing of a request in the other direction will + * notice us. another possible fix would be to split the + * rq mempool into READ and WRITE + */ +rq_starved: + if (unlikely(rl->count[rw] == 0)) + rl->starved[rw] = 1; + + goto out; + } + + /* + * ioc may be NULL here, and ioc_batching will be false. That's + * OK, if the queue is under the request limit then requests need + * not count toward the nr_batch_requests limit. There will always + * be some limit enforced by BLK_BATCH_TIME. + */ + if (ioc_batching(q, ioc)) + ioc->nr_batch_requests--; + + trace_block_getrq(q, bio, rw); +out: + return rq; +} + +/* + * No available requests for this queue, unplug the device and wait for some + * requests to become available. + * + * Called with q->queue_lock held, and returns with it unlocked. 
+ */ +static struct request *get_request_wait(struct request_queue *q, int rw_flags, + struct bio *bio) +{ + const int rw = rw_flags & 0x01; + struct request *rq; + + rq = get_request(q, rw_flags, bio, GFP_NOIO); + while (!rq) { + DEFINE_WAIT(wait); + struct io_context *ioc; + struct request_list *rl = &q->rq; + + prepare_to_wait_exclusive(&rl->wait[rw], &wait, + TASK_UNINTERRUPTIBLE); + + trace_block_sleeprq(q, bio, rw); + + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + io_schedule(); + + /* + * After sleeping, we become a "batching" process and + * will be able to allocate at least one request, and + * up to a big batch of them for a small period time. + * See ioc_batching, ioc_set_batching + */ + ioc = current_io_context(GFP_NOIO, q->node); + ioc_set_batching(q, ioc); + + spin_lock_irq(q->queue_lock); + finish_wait(&rl->wait[rw], &wait); + + rq = get_request(q, rw_flags, bio, GFP_NOIO); + }; + + return rq; +} + +struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +{ + struct request *rq; + + BUG_ON(rw != READ && rw != WRITE); + + spin_lock_irq(q->queue_lock); + if (gfp_mask & __GFP_WAIT) { + rq = get_request_wait(q, rw, NULL); + } else { + rq = get_request(q, rw, NULL, gfp_mask); + if (!rq) + spin_unlock_irq(q->queue_lock); + } + /* q->queue_lock is unlocked at this point */ + + return rq; +} +EXPORT_SYMBOL(blk_get_request); + +/** + * blk_start_queueing - initiate dispatch of requests to device + * @q: request queue to kick into gear + * + * This is basically a helper to remove the need to know whether a queue + * is plugged or not if someone just wants to initiate dispatch of requests + * for this queue. Should be used to start queueing on a device outside + * of ->request_fn() context. Also see @blk_run_queue. + * + * The queue lock must be held with interrupts disabled. + */ +void blk_start_queueing(struct request_queue *q) +{ + if (!blk_queue_plugged(q)) { + if (unlikely(blk_queue_stopped(q))) + return; + q->request_fn(q); + } else + __generic_unplug_device(q); +} +EXPORT_SYMBOL(blk_start_queueing); + +/** + * blk_requeue_request - put a request back on queue + * @q: request queue where request should be inserted + * @rq: request to be inserted + * + * Description: + * Drivers often keep queueing requests until the hardware cannot accept + * more, when that condition happens we need to put the request back + * on the queue. Must be called with queue lock held. + */ +void blk_requeue_request(struct request_queue *q, struct request *rq) +{ + blk_delete_timer(rq); + blk_clear_rq_complete(rq); + trace_block_rq_requeue(q, rq); + + if (blk_rq_tagged(rq)) + blk_queue_end_tag(q, rq); + + elv_requeue_request(q, rq); +} +EXPORT_SYMBOL(blk_requeue_request); + +/** + * blk_insert_request - insert a special request into a request queue + * @q: request queue where request should be inserted + * @rq: request to be inserted + * @at_head: insert request at head or tail of queue + * @data: private data + * + * Description: + * Many block devices need to execute commands asynchronously, so they don't + * block the whole kernel from preemption during request execution. This is + * accomplished normally by inserting aritficial requests tagged as + * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them + * be scheduled for actual execution by the request queue. + * + * We have the option of inserting the head or the tail of the queue. + * Typically we use the tail for new ioctls and so forth. 
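blk_get_request() above is the allocation side of the non-bio path: callers use it to push prepared commands at a queue, typically together with blk_execute_rq() from blk-exec.c (assumed here to be built into this library as well). A hedged sketch with hypothetical names:

    /* Illustrative only: allocate a request, mark it special, run it. */
    static int mydrv_send_private_cmd(struct request_queue *q, void *cmd)
    {
        struct request *rq;
        int err;

        rq = blk_get_request(q, WRITE, __GFP_WAIT); /* sleeps until a request is free */
        if (!rq)
            return -ENOMEM;

        rq->cmd_type = REQ_TYPE_SPECIAL;
        rq->special = cmd;

        err = blk_execute_rq(q, NULL, rq, 0);   /* waits for completion */

        blk_put_request(rq);
        return err;
    }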
We use the head + * of the queue for things like a QUEUE_FULL message from a device, or a + * host that is unable to accept a particular command. + */ +void blk_insert_request(struct request_queue *q, struct request *rq, + int at_head, void *data) +{ + int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + unsigned long flags; + + /* + * tell I/O scheduler that this isn't a regular read/write (ie it + * must not attempt merges on this) and that it acts as a soft + * barrier + */ + rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_flags |= REQ_SOFTBARRIER; + + rq->special = data; + + spin_lock_irqsave(q->queue_lock, flags); + + /* + * If command is tagged, release the tag + */ + if (blk_rq_tagged(rq)) + blk_queue_end_tag(q, rq); + + drive_stat_acct(rq, 1); + __elv_add_request(q, rq, where, 0); + blk_start_queueing(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL(blk_insert_request); + +/* + * add-request adds a request to the linked list. + * queue lock is held and interrupts disabled, as we muck with the + * request queue list. + */ +static inline void add_request(struct request_queue *q, struct request *req) +{ + drive_stat_acct(req, 1); + + /* + * elevator indicated where it wants this request to be + * inserted at elevator_merge time + */ + __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); +} + +static void part_round_stats_single(int cpu, struct hd_struct *part, + unsigned long now) +{ + if (now == part->stamp) + return; + + if (part->in_flight) { + __part_stat_add(cpu, part, time_in_queue, + part->in_flight * (now - part->stamp)); + __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); + } + part->stamp = now; +} + +/** + * part_round_stats() - Round off the performance stats on a struct disk_stats. + * @cpu: cpu number for stats access + * @part: target partition + * + * The average IO queue length and utilisation statistics are maintained + * by observing the current state of the queue length and the amount of + * time it has been in this state for. + * + * Normally, that accounting is done on IO completion, but that can result + * in more than a second's worth of IO being accounted for within any one + * second, leading to >100% utilisation. To deal with that, we call this + * function to do a round-off before returning the results when reading + * /proc/diskstats. This accounts immediately for all queue usage up to + * the current jiffies and restarts the counters again. + */ +void part_round_stats(int cpu, struct hd_struct *part) +{ + unsigned long now = jiffies; + + if (part->partno) + part_round_stats_single(cpu, &part_to_disk(part)->part0, now); + part_round_stats_single(cpu, part, now); +} +EXPORT_SYMBOL_GPL(part_round_stats); + +/* + * queue lock must be held + */ +void __blk_put_request(struct request_queue *q, struct request *req) +{ + if (unlikely(!q)) + return; + if (unlikely(--req->ref_count)) + return; + + elv_completed_request(q, req); + + /* + * Request may not have originated from ll_rw_blk. 
if not, + * it didn't come out of our reserved rq pools + */ + if (req->cmd_flags & REQ_ALLOCED) { + int rw = rq_data_dir(req); + int priv = req->cmd_flags & REQ_ELVPRIV; + + BUG_ON(!list_empty(&req->queuelist)); + BUG_ON(!hlist_unhashed(&req->hash)); + + blk_free_request(q, req); + freed_request(q, rw, priv); + } +} +EXPORT_SYMBOL_GPL(__blk_put_request); + +void blk_put_request(struct request *req) +{ + unsigned long flags; + struct request_queue *q = req->q; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_put_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL(blk_put_request); + +void init_request_from_bio(struct request *req, struct bio *bio) +{ + req->cpu = bio->bi_comp_cpu; + req->cmd_type = REQ_TYPE_FS; + + /* + * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) + */ + if (bio_rw_ahead(bio)) + req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | + REQ_FAILFAST_DRIVER); + if (bio_failfast_dev(bio)) + req->cmd_flags |= REQ_FAILFAST_DEV; + if (bio_failfast_transport(bio)) + req->cmd_flags |= REQ_FAILFAST_TRANSPORT; + if (bio_failfast_driver(bio)) + req->cmd_flags |= REQ_FAILFAST_DRIVER; + + /* + * REQ_BARRIER implies no merging, but lets make it explicit + */ + if (unlikely(bio_discard(bio))) { + req->cmd_flags |= REQ_DISCARD; + if (bio_barrier(bio)) + req->cmd_flags |= REQ_SOFTBARRIER; + req->q->prepare_discard_fn(req->q, req); + } else if (unlikely(bio_barrier(bio))) + req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); + + if (bio_sync(bio)) + req->cmd_flags |= REQ_RW_SYNC; + if (bio_unplug(bio)) + req->cmd_flags |= REQ_UNPLUG; + if (bio_rw_meta(bio)) + req->cmd_flags |= REQ_RW_META; + + req->errors = 0; + req->hard_sector = req->sector = bio->bi_sector; + req->ioprio = bio_prio(bio); + req->start_time = jiffies; + blk_rq_bio_prep(req->q, req, bio); +} + +static int __make_request(struct request_queue *q, struct bio *bio) +{ + struct request *req; + int el_ret, nr_sectors; + const unsigned short prio = bio_prio(bio); + const int sync = bio_sync(bio); + const int unplug = bio_unplug(bio); + int rw_flags; + + nr_sectors = bio_sectors(bio); + + /* + * low level driver can indicate that it wants pages above a + * certain limit bounced to low memory (ie for highmem, or even + * ISA dma in theory) + */ + blk_queue_bounce(q, &bio); + + spin_lock_irq(q->queue_lock); + + if (unlikely(bio_barrier(bio)) || elv_queue_empty(q)) + goto get_rq; + + el_ret = elv_merge(q, &req, bio); + switch (el_ret) { + case ELEVATOR_BACK_MERGE: + BUG_ON(!rq_mergeable(req)); + + if (!ll_back_merge_fn(q, req, bio)) + break; + + trace_block_bio_backmerge(q, bio); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); + if (!blk_rq_cpu_valid(req)) + req->cpu = bio->bi_comp_cpu; + drive_stat_acct(req, 0); + if (!attempt_back_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out; + + case ELEVATOR_FRONT_MERGE: + BUG_ON(!rq_mergeable(req)); + + if (!ll_front_merge_fn(q, req, bio)) + break; + + trace_block_bio_frontmerge(q, bio); + + bio->bi_next = req->bio; + req->bio = bio; + + /* + * may not be valid. if the low level driver said + * it didn't need a bounce buffer then it better + * not touch req->buffer either... 
+ */ + req->buffer = bio_data(bio); + req->current_nr_sectors = bio_cur_sectors(bio); + req->hard_cur_sectors = req->current_nr_sectors; + req->sector = req->hard_sector = bio->bi_sector; + req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); + if (!blk_rq_cpu_valid(req)) + req->cpu = bio->bi_comp_cpu; + drive_stat_acct(req, 0); + if (!attempt_front_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out; + + /* ELV_NO_MERGE: elevator says don't/can't merge. */ + default: + ; + } + +get_rq: + /* + * This sync check and mask will be re-done in init_request_from_bio(), + * but we need to set it earlier to expose the sync flag to the + * rq allocator and io schedulers. + */ + rw_flags = bio_data_dir(bio); + if (sync) + rw_flags |= REQ_RW_SYNC; + + /* + * Grab a free request. This is might sleep but can not fail. + * Returns with the queue unlocked. + */ + req = get_request_wait(q, rw_flags, bio); + + /* + * After dropping the lock and possibly sleeping here, our request + * may now be mergeable after it had proven unmergeable (above). + * We don't worry about that case for efficiency. It won't happen + * often, and the elevators are able to handle it. + */ + init_request_from_bio(req, bio); + + spin_lock_irq(q->queue_lock); + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || + bio_flagged(bio, BIO_CPU_AFFINE)) + req->cpu = blk_cpu_to_group(smp_processor_id()); + if (!blk_queue_nonrot(q) && elv_queue_empty(q)) + blk_plug_device(q); + add_request(q, req); +out: + if (unplug || blk_queue_nonrot(q)) + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + return 0; +} + +/* + * If bio->bi_dev is a partition, remap the location + */ +static inline void blk_partition_remap(struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + + if (bio_sectors(bio) && bdev != bdev->bd_contains) { + struct hd_struct *p = bdev->bd_part; + + bio->bi_sector += p->start_sect; + bio->bi_bdev = bdev->bd_contains; + + trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, + bdev->bd_dev, bio->bi_sector, + bio->bi_sector - p->start_sect); + } +} + +static void handle_bad_sector(struct bio *bio) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_INFO "attempt to access beyond end of device\n"); + printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", + bdevname(bio->bi_bdev, b), + bio->bi_rw, + (unsigned long long)bio->bi_sector + bio_sectors(bio), + (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); + + set_bit(BIO_EOF, &bio->bi_flags); +} + +#ifdef CONFIG_FAIL_MAKE_REQUEST + +static DECLARE_FAULT_ATTR(fail_make_request); + +static int __init setup_fail_make_request(char *str) +{ + return setup_fault_attr(&fail_make_request, str); +} +__setup("fail_make_request=", setup_fail_make_request); + +static int should_fail_request(struct bio *bio) +{ + struct hd_struct *part = bio->bi_bdev->bd_part; + + if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) + return should_fail(&fail_make_request, bio->bi_size); + + return 0; +} + +static int __init fail_make_request_debugfs(void) +{ + return init_fault_attr_dentries(&fail_make_request, + "fail_make_request"); +} + +late_initcall(fail_make_request_debugfs); + +#else /* CONFIG_FAIL_MAKE_REQUEST */ + +static inline int should_fail_request(struct bio *bio) +{ + return 0; +} + +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + +/* + * Check whether this bio extends beyond the end of the device. 
+ */ +static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) +{ + sector_t maxsector; + + if (!nr_sectors) + return 0; + + /* Test device or partition size, when known. */ + maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + if (maxsector) { + sector_t sector = bio->bi_sector; + + if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { + /* + * This may well happen - the kernel calls bread() + * without checking the size of the device, e.g., when + * mounting a device. + */ + handle_bad_sector(bio); + return 1; + } + } + + return 0; +} + +/** + * generic_make_request - hand a buffer to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * generic_make_request() is used to make I/O requests of block + * devices. It is passed a &struct bio, which describes the I/O that needs + * to be done. + * + * generic_make_request() does not return any status. The + * success/failure status of the request, along with notification of + * completion, is delivered asynchronously through the bio->bi_end_io + * function described (one day) else where. + * + * The caller of generic_make_request must make sure that bi_io_vec + * are set to describe the memory buffer, and that bi_dev and bi_sector are + * set to describe the device address, and the + * bi_end_io and optionally bi_private are set to describe how + * completion notification should be signaled. + * + * generic_make_request and the drivers it calls may use bi_next if this + * bio happens to be merged with someone else, and may change bi_dev and + * bi_sector for remaps as it sees fit. So the values of these fields + * should NOT be depended on after the call to generic_make_request. + */ +static inline void __generic_make_request(struct bio *bio) +{ + struct request_queue *q; + sector_t old_sector; + int ret, nr_sectors = bio_sectors(bio); + dev_t old_dev; + int err = -EIO; + + might_sleep(); + + if (bio_check_eod(bio, nr_sectors)) + goto end_io; + + /* + * Resolve the mapping until finished. (drivers are + * still free to implement/resolve their own stacking + * by explicitly returning 0) + * + * NOTE: we don't repeat the blk_size check for each new device. + * Stacking drivers are expected to know what they are doing. + */ + old_sector = -1; + old_dev = 0; + do { + char b[BDEVNAME_SIZE]; + + q = bdev_get_queue(bio->bi_bdev); + if (unlikely(!q)) { + printk(KERN_ERR + "generic_make_request: Trying to access " + "nonexistent block-device %s (%Lu)\n", + bdevname(bio->bi_bdev, b), + (long long) bio->bi_sector); + goto end_io; + } + + if (unlikely(nr_sectors > q->max_hw_sectors)) { + printk(KERN_ERR "bio too big device %s (%u > %u)\n", + bdevname(bio->bi_bdev, b), + bio_sectors(bio), + q->max_hw_sectors); + goto end_io; + } + + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + goto end_io; + + if (should_fail_request(bio)) + goto end_io; + + /* + * If this device has partitions, remap block n + * of partition p to block n+start(p) of the disk. 
+ */ + blk_partition_remap(bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; + + if (old_sector != -1) + trace_block_remap(q, bio, old_dev, bio->bi_sector, + old_sector); + + trace_block_bio_queue(q, bio); + + old_sector = bio->bi_sector; + old_dev = bio->bi_bdev->bd_dev; + + if (bio_check_eod(bio, nr_sectors)) + goto end_io; + + if (bio_discard(bio) && !q->prepare_discard_fn) { + err = -EOPNOTSUPP; + goto end_io; + } + if (bio_barrier(bio) && bio_has_data(bio) && + (q->next_ordered == QUEUE_ORDERED_NONE)) { + err = -EOPNOTSUPP; + goto end_io; + } + + ret = q->make_request_fn(q, bio); + } while (ret); + + return; + +end_io: + bio_endio(bio, err); +} + +/* + * We only want one ->make_request_fn to be active at a time, + * else stack usage with stacked devices could be a problem. + * So use current->bio_{list,tail} to keep a list of requests + * submitted by a make_request_fn function. + * current->bio_tail is also used as a flag to say if + * generic_make_request is currently active in this task or not. + * If it is NULL, then no make_request is active. If it is non-NULL, + * then a make_request is active, and new requests should be added + * at the tail + */ +void generic_make_request(struct bio *bio) +{ + if (current->bio_tail) { + /* make_request is active */ + *(current->bio_tail) = bio; + bio->bi_next = NULL; + current->bio_tail = &bio->bi_next; + return; + } + /* following loop may be a bit non-obvious, and so deserves some + * explanation. + * Before entering the loop, bio->bi_next is NULL (as all callers + * ensure that) so we have a list with a single bio. + * We pretend that we have just taken it off a longer list, so + * we assign bio_list to the next (which is NULL) and bio_tail + * to &bio_list, thus initialising the bio_list of new bios to be + * added. __generic_make_request may indeed add some more bios + * through a recursive call to generic_make_request. If it + * did, we find a non-NULL value in bio_list and re-enter the loop + * from the top. In this case we really did just take the bio + * off the top of the list (no pretending) and so fixup bio_list and + * bio_tail or bi_next, and call into __generic_make_request again. + * + * The loop was structured like this to make only one call to + * __generic_make_request (which is important as it is large and + * inlined) and to keep the structure simple. + */ + BUG_ON(bio->bi_next); + do { + current->bio_list = bio->bi_next; + if (bio->bi_next == NULL) + current->bio_tail = &current->bio_list; + else + bio->bi_next = NULL; + __generic_make_request(bio); + bio = current->bio_list; + } while (bio); + current->bio_tail = NULL; /* deactivate */ +} +EXPORT_SYMBOL(generic_make_request); + +/** + * submit_bio - submit a bio to the block device layer for I/O + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * submit_bio() is very similar in purpose to generic_make_request(), and + * uses that function to do most of the work. Both are fairly rough + * interfaces; @bio must be presetup and ready for I/O. + * + */ +void submit_bio(int rw, struct bio *bio) +{ + int count = bio_sectors(bio); + + bio->bi_rw |= rw; + + /* + * If it's a regular read/write or a barrier with data attached, + * go through the normal accounting stuff before submission.
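generic_make_request()/submit_bio() above are the entry points for code that builds I/O out of bios rather than requests; a minimal synchronous read might be assembled as follows (sketch only, error handling omitted, names invented):

    /* Illustrative only: read one page from the start of a block device. */
    static void my_read_end_io(struct bio *bio, int error)
    {
        complete(bio->bi_private);      /* wake the submitter */
        bio_put(bio);
    }

    static int my_read_first_page(struct block_device *bdev, struct page *page)
    {
        DECLARE_COMPLETION_ONSTACK(done);
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        if (!bio)
            return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_sector = 0;
        bio->bi_end_io = my_read_end_io;
        bio->bi_private = &done;
        bio_add_page(bio, page, PAGE_SIZE, 0);

        submit_bio(READ, bio);
        wait_for_completion(&done);
        return 0;
    }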
+ */ + if (bio_has_data(bio)) { + if (rw & WRITE) { + count_vm_events(PGPGOUT, count); + } else { + task_io_account_read(bio->bi_size); + count_vm_events(PGPGIN, count); + } + + if (unlikely(block_dump)) { + char b[BDEVNAME_SIZE]; + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + current->comm, task_pid_nr(current), + (rw & WRITE) ? "WRITE" : "READ", + (unsigned long long)bio->bi_sector, + bdevname(bio->bi_bdev, b)); + } + } + + generic_make_request(bio); +} +EXPORT_SYMBOL(submit_bio); + +/** + * blk_rq_check_limits - Helper function to check a request for the queue limit + * @q: the queue + * @rq: the request being checked + * + * Description: + * @rq may have been made based on weaker limitations of upper-level queues + * in request stacking drivers, and it may violate the limitation of @q. + * Since the block layer and the underlying device driver trust @rq + * after it is inserted to @q, it should be checked against @q before + * the insertion using this generic function. + * + * This function should also be useful for request stacking drivers + * in some cases below, so export this fuction. + * Request stacking drivers like request-based dm may change the queue + * limits while requests are in the queue (e.g. dm's table swapping). + * Such request stacking drivers should check those requests agaist + * the new queue limits again when they dispatch those requests, + * although such checkings are also done against the old queue limits + * when submitting requests. + */ +int blk_rq_check_limits(struct request_queue *q, struct request *rq) +{ + if (rq->nr_sectors > q->max_sectors || + rq->data_len > q->max_hw_sectors << 9) { + printk(KERN_ERR "%s: over max size limit.\n", __func__); + return -EIO; + } + + /* + * queue's settings related to segment counting like q->bounce_pfn + * may differ from that of other stacking queues. + * Recalculate it to check the request correctly on this queue's + * limitation. + */ + blk_recalc_rq_segments(rq); + if (rq->nr_phys_segments > q->max_phys_segments || + rq->nr_phys_segments > q->max_hw_segments) { + printk(KERN_ERR "%s: over max segments limit.\n", __func__); + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(blk_rq_check_limits); + +/** + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued + */ +int blk_insert_cloned_request(struct request_queue *q, struct request *rq) +{ + unsigned long flags; + + if (blk_rq_check_limits(q, rq)) + return -EIO; + +#ifdef CONFIG_FAIL_MAKE_REQUEST + if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && + should_fail(&fail_make_request, blk_rq_bytes(rq))) + return -EIO; +#endif + + spin_lock_irqsave(q->queue_lock, flags); + + /* + * Submitting request must be dequeued before calling this function + * because it will be linked to another request_queue + */ + BUG_ON(blk_queued_rq(rq)); + + drive_stat_acct(rq, 1); + __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); + + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(blk_insert_cloned_request); + +/** + * blkdev_dequeue_request - dequeue request and start timeout timer + * @req: request to dequeue + * + * Dequeue @req and start timeout timer on it. This hands off the + * request to the driver. + * + * Block internal functions which don't want to start timer should + * call elv_dequeue_request(). 
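blk_rq_check_limits() and blk_insert_cloned_request() above exist for request-based stacking drivers (request-based dm is the intended user); the dispatch path of such a driver essentially hands an already-dequeued clone to the lower queue. Sketch only, with a hypothetical wrapper name:

    /* Illustrative only: a stacking driver submitting a clone downwards. */
    static int my_dispatch_clone(struct request_queue *lower_q,
                                 struct request *clone)
    {
        /*
         * The clone must not be queued anywhere at this point;
         * blk_insert_cloned_request() re-checks it against lower_q's
         * limits before linking it in and running the queue.
         */
        return blk_insert_cloned_request(lower_q, clone);
    }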
+ */ +void blkdev_dequeue_request(struct request *req) +{ + elv_dequeue_request(req->q, req); + + /* + * We are now handing the request to the hardware, add the + * timeout handler. + */ + blk_add_timer(req); +} +EXPORT_SYMBOL(blkdev_dequeue_request); + +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_do_io_stat(disk->queue)) + return; + + if (blk_fs_request(req)) { + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(req->rq_disk, req->sector); + part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_unlock(); + } +} + +static void blk_account_io_done(struct request *req) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_do_io_stat(disk->queue)) + return; + + /* + * Account IO completion. bar_rq isn't accounted as a normal + * IO on queueing nor completion. Accounting the containing + * request is enough. + */ + if (blk_fs_request(req) && req != &req->q->bar_rq) { + unsigned long duration = jiffies - req->start_time; + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(disk, req->sector); + + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part); + + part_stat_unlock(); + } +} + +/** + * __end_that_request_first - end I/O on a request + * @req: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Ends I/O on a number of bytes attached to @req, and sets it up + * for the next range of segments (if any) in the cluster. + * + * Return: + * %0 - we are done with this request, call end_that_request_last() + * %1 - still buffers pending for this request + **/ +static int __end_that_request_first(struct request *req, int error, + int nr_bytes) +{ + int total_bytes, bio_nbytes, next_idx = 0; + struct bio *bio; + + trace_block_rq_complete(req->q, req); + + /* + * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual + * sense key with us all the way through + */ + if (!blk_pc_request(req)) + req->errors = 0; + + if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { + printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", + req->rq_disk ? 
req->rq_disk->disk_name : "?", + (unsigned long long)req->sector); + } + + blk_account_io_completion(req, nr_bytes); + + total_bytes = bio_nbytes = 0; + while ((bio = req->bio) != NULL) { + int nbytes; + + if (nr_bytes >= bio->bi_size) { + req->bio = bio->bi_next; + nbytes = bio->bi_size; + req_bio_endio(req, bio, nbytes, error); + next_idx = 0; + bio_nbytes = 0; + } else { + int idx = bio->bi_idx + next_idx; + + if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { + blk_dump_rq_flags(req, "__end_that"); + printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", + __func__, bio->bi_idx, bio->bi_vcnt); + break; + } + + nbytes = bio_iovec_idx(bio, idx)->bv_len; + BIO_BUG_ON(nbytes > bio->bi_size); + + /* + * not a complete bvec done + */ + if (unlikely(nbytes > nr_bytes)) { + bio_nbytes += nr_bytes; + total_bytes += nr_bytes; + break; + } + + /* + * advance to the next vector + */ + next_idx++; + bio_nbytes += nbytes; + } + + total_bytes += nbytes; + nr_bytes -= nbytes; + + bio = req->bio; + if (bio) { + /* + * end more in this run, or just return 'not-done' + */ + if (unlikely(nr_bytes <= 0)) + break; + } + } + + /* + * completely done + */ + if (!req->bio) + return 0; + + /* + * if the request wasn't completed, update state + */ + if (bio_nbytes) { + req_bio_endio(req, bio, bio_nbytes, error); + bio->bi_idx += next_idx; + bio_iovec(bio)->bv_offset += nr_bytes; + bio_iovec(bio)->bv_len -= nr_bytes; + } + + blk_recalc_rq_sectors(req, total_bytes >> 9); + blk_recalc_rq_segments(req); + return 1; +} + +/* + * queue lock must be held + */ +static void end_that_request_last(struct request *req, int error) +{ + if (blk_rq_tagged(req)) + blk_queue_end_tag(req->q, req); + + if (blk_queued_rq(req)) + elv_dequeue_request(req->q, req); + +#ifndef DDE_LINUX + if (unlikely(laptop_mode) && blk_fs_request(req)) + laptop_io_completion(); +#endif + + blk_delete_timer(req); + + blk_account_io_done(req); + + if (req->end_io) + req->end_io(req, error); + else { + if (blk_bidi_rq(req)) + __blk_put_request(req->next_rq->q, req->next_rq); + + __blk_put_request(req->q, req); + } +} + +/** + * blk_rq_bytes - Returns bytes left to complete in the entire request + * @rq: the request being processed + **/ +unsigned int blk_rq_bytes(struct request *rq) +{ + if (blk_fs_request(rq)) + return rq->hard_nr_sectors << 9; + + return rq->data_len; +} +EXPORT_SYMBOL_GPL(blk_rq_bytes); + +/** + * blk_rq_cur_bytes - Returns bytes left to complete in the current segment + * @rq: the request being processed + **/ +unsigned int blk_rq_cur_bytes(struct request *rq) +{ + if (blk_fs_request(rq)) + return rq->current_nr_sectors << 9; + + if (rq->bio) + return rq->bio->bi_size; + + return rq->data_len; +} +EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); + +/** + * end_request - end I/O on the current segment of the request + * @req: the request being processed + * @uptodate: error value or %0/%1 uptodate flag + * + * Description: + * Ends I/O on the current segment of a request. If that is the only + * remaining segment, the request is also completed and freed. + * + * This is a remnant of how older block drivers handled I/O completions. + * Modern drivers typically end I/O on the full request in one go, unless + * they have a residual value to account for. For that case this function + * isn't really useful, unless the residual just happens to be the + * full current segment. In other words, don't use this function in new + * code. Use blk_end_request() or __blk_end_request() to end a request. 
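In a modern driver that advice typically takes the following shape in the completion interrupt (hypothetical mydrv_* names, sketch only):

    /* Illustrative only: complete the whole request from the IRQ handler. */
    static irqreturn_t mydrv_irq(int irq, void *dev_id)
    {
        struct mydrv_dev *dev = dev_id;          /* hypothetical driver state */
        struct request *rq = dev->in_flight_rq;  /* issued earlier by the request_fn */
        int error = mydrv_hw_failed(dev) ? -EIO : 0;

        /* blk_end_request() takes q->queue_lock itself, so call it unlocked */
        blk_end_request(rq, error, blk_rq_bytes(rq));
        return IRQ_HANDLED;
    }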
+ **/ +void end_request(struct request *req, int uptodate) +{ + int error = 0; + + if (uptodate <= 0) + error = uptodate ? uptodate : -EIO; + + __blk_end_request(req, error, req->hard_cur_sectors << 9); +} +EXPORT_SYMBOL(end_request); + +static int end_that_request_data(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) +{ + if (rq->bio) { + if (__end_that_request_first(rq, error, nr_bytes)) + return 1; + + /* Bidi request must be completed as a whole */ + if (blk_bidi_rq(rq) && + __end_that_request_first(rq->next_rq, error, bidi_bytes)) + return 1; + } + + return 0; +} + +/** + * blk_end_io - Generic end_io function to complete a request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete @rq + * @bidi_bytes: number of bytes to complete @rq->next_rq + * @drv_callback: function called between completion of bios in the request + * and completion of the request. + * If the callback returns non %0, this helper returns without + * completion of the request. + * + * Description: + * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. + * If @rq has leftover, sets it up for the next range of segments. + * + * Return: + * %0 - we are done with this request + * %1 - this request is not freed yet, it still has pending buffers. + **/ +static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, + unsigned int bidi_bytes, + int (drv_callback)(struct request *)) +{ + struct request_queue *q = rq->q; + unsigned long flags = 0UL; + + if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) + return 1; + + /* Special feature for tricky drivers */ + if (drv_callback && drv_callback(rq)) + return 1; + +#ifndef DDE_LINUX + add_disk_randomness(rq->rq_disk); +#endif + + spin_lock_irqsave(q->queue_lock, flags); + end_that_request_last(rq, error); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; +} + +/** + * blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Ends I/O on a number of bytes attached to @rq. + * If @rq has leftover, sets it up for the next range of segments. + * + * Return: + * %0 - we are done with this request + * %1 - still buffers pending for this request + **/ +int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +{ + return blk_end_io(rq, error, nr_bytes, 0, NULL); +} +EXPORT_SYMBOL_GPL(blk_end_request); + +/** + * __blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Must be called with queue lock held unlike blk_end_request(). + * + * Return: + * %0 - we are done with this request + * %1 - still buffers pending for this request + **/ +int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +{ + if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) + return 1; + +#ifndef DDE_LINUX + add_disk_randomness(rq->rq_disk); +#endif + + end_that_request_last(rq, error); + + return 0; +} +EXPORT_SYMBOL_GPL(__blk_end_request); + +/** + * blk_end_bidi_request - Helper function for drivers to complete bidi request. 
+ * @rq: the bidi request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete @rq + * @bidi_bytes: number of bytes to complete @rq->next_rq + * + * Description: + * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. + * + * Return: + * %0 - we are done with this request + * %1 - still buffers pending for this request + **/ +int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, + unsigned int bidi_bytes) +{ + return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); +} +EXPORT_SYMBOL_GPL(blk_end_bidi_request); + +/** + * blk_update_request - Special helper function for request stacking drivers + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete @rq + * + * Description: + * Ends I/O on a number of bytes attached to @rq, but doesn't complete + * the request structure even if @rq doesn't have leftover. + * If @rq has leftover, sets it up for the next range of segments. + * + * This special helper function is only for request stacking drivers + * (e.g. request-based dm) so that they can handle partial completion. + * Actual device drivers should use blk_end_request instead. + */ +void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) +{ + if (!end_that_request_data(rq, error, nr_bytes, 0)) { + /* + * These members are not updated in end_that_request_data() + * when all bios are completed. + * Update them so that the request stacking driver can find + * how many bytes remain in the request later. + */ + rq->nr_sectors = rq->hard_nr_sectors = 0; + rq->current_nr_sectors = rq->hard_cur_sectors = 0; + } +} +EXPORT_SYMBOL_GPL(blk_update_request); + +/** + * blk_end_request_callback - Special helper function for tricky drivers + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * @drv_callback: function called between completion of bios in the request + * and completion of the request. + * If the callback returns non %0, this helper returns without + * completion of the request. + * + * Description: + * Ends I/O on a number of bytes attached to @rq. + * If @rq has leftover, sets it up for the next range of segments. + * + * This special helper function is used only for existing tricky drivers. + * (e.g. cdrom_newpc_intr() of ide-cd) + * This interface will be removed when such drivers are rewritten. + * Don't use this interface in other places anymore. + * + * Return: + * %0 - we are done with this request + * %1 - this request is not freed yet. + * this request still has pending buffers or + * the driver doesn't want to finish this request yet. + **/ +int blk_end_request_callback(struct request *rq, int error, + unsigned int nr_bytes, + int (drv_callback)(struct request *)) +{ + return blk_end_io(rq, error, nr_bytes, 0, drv_callback); +} +EXPORT_SYMBOL_GPL(blk_end_request_callback); + +void blk_rq_bio_prep(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and + we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). 
*/ + rq->cmd_flags |= (bio->bi_rw & 3); + + if (bio_has_data(bio)) { + rq->nr_phys_segments = bio_phys_segments(q, bio); + rq->buffer = bio_data(bio); + } + rq->current_nr_sectors = bio_cur_sectors(bio); + rq->hard_cur_sectors = rq->current_nr_sectors; + rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); + rq->data_len = bio->bi_size; + + rq->bio = rq->biotail = bio; + + if (bio->bi_bdev) + rq->rq_disk = bio->bi_bdev->bd_disk; +} + +/** + * blk_lld_busy - Check if underlying low-level drivers of a device are busy + * @q : the queue of the device being checked + * + * Description: + * Check if underlying low-level drivers of a device are busy. + * If the drivers want to export their busy state, they must set own + * exporting function using blk_queue_lld_busy() first. + * + * Basically, this function is used only by request stacking drivers + * to stop dispatching requests to underlying devices when underlying + * devices are busy. This behavior helps more I/O merging on the queue + * of the request stacking driver and prevents I/O throughput regression + * on burst I/O load. + * + * Return: + * 0 - Not busy (The request stacking driver should dispatch request) + * 1 - Busy (The request stacking driver should stop dispatching request) + */ +int blk_lld_busy(struct request_queue *q) +{ + if (q->lld_busy_fn) + return q->lld_busy_fn(q); + + return 0; +} +EXPORT_SYMBOL_GPL(blk_lld_busy); + +int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) +{ + return schedule_work (work); +} +EXPORT_SYMBOL(kblockd_schedule_work); + +int __init blk_dev_init(void) +{ + kblockd_workqueue = create_workqueue("kblockd"); + if (!kblockd_workqueue) + panic("Failed to create kblockd\n"); + + request_cachep = kmem_cache_create("blkdev_requests", + sizeof(struct request), 0, SLAB_PANIC, NULL); + + blk_requestq_cachep = kmem_cache_create("blkdev_queue", + sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + + return 0; +} + diff --git a/libdde-linux26/lib/src/block/blk.h b/libdde-linux26/lib/src/block/blk.h new file mode 100644 index 00000000..0dce92c3 --- /dev/null +++ b/libdde-linux26/lib/src/block/blk.h @@ -0,0 +1,119 @@ +#ifndef BLK_INTERNAL_H +#define BLK_INTERNAL_H + +/* Amount of time in which a process may batch requests */ +#define BLK_BATCH_TIME (HZ/50UL) + +/* Number of requests a "batching" process may submit */ +#define BLK_BATCH_REQ 32 + +extern struct kmem_cache *blk_requestq_cachep; +extern struct kobj_type blk_queue_ktype; + +void init_request_from_bio(struct request *req, struct bio *bio); +void blk_rq_bio_prep(struct request_queue *q, struct request *rq, + struct bio *bio); +void __blk_queue_free_tags(struct request_queue *q); + +void blk_unplug_work(struct work_struct *work); +void blk_unplug_timeout(unsigned long data); +void blk_rq_timed_out_timer(unsigned long data); +void blk_delete_timer(struct request *); +void blk_add_timer(struct request *); +void __generic_unplug_device(struct request_queue *); + +/* + * Internal atomic flags for request handling + */ +enum rq_atomic_flags { + REQ_ATOM_COMPLETE = 0, +}; + +/* + * EH timer and IO completion will both attempt to 'grab' the request, make + * sure that only one of them suceeds + */ +static inline int blk_mark_rq_complete(struct request *rq) +{ + return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); +} + +static inline void blk_clear_rq_complete(struct request *rq) +{ + clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); +} + +#ifdef CONFIG_FAIL_IO_TIMEOUT +int blk_should_fake_timeout(struct request_queue *); 
+ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); +ssize_t part_timeout_store(struct device *, struct device_attribute *, + const char *, size_t); +#else +static inline int blk_should_fake_timeout(struct request_queue *q) +{ + return 0; +} +#endif + +struct io_context *current_io_context(gfp_t gfp_flags, int node); + +int ll_back_merge_fn(struct request_queue *q, struct request *req, + struct bio *bio); +int ll_front_merge_fn(struct request_queue *q, struct request *req, + struct bio *bio); +int attempt_back_merge(struct request_queue *q, struct request *rq); +int attempt_front_merge(struct request_queue *q, struct request *rq); +void blk_recalc_rq_segments(struct request *rq); +void blk_recalc_rq_sectors(struct request *rq, int nsect); + +void blk_queue_congestion_threshold(struct request_queue *q); + +int blk_dev_init(void); + +/* + * Return the threshold (number of used requests) at which the queue is + * considered to be congested. It include a little hysteresis to keep the + * context switch rate down. + */ +static inline int queue_congestion_on_threshold(struct request_queue *q) +{ + return q->nr_congestion_on; +} + +/* + * The threshold at which a queue is considered to be uncongested + */ +static inline int queue_congestion_off_threshold(struct request_queue *q) +{ + return q->nr_congestion_off; +} + +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) + +#endif /* BLK_DEV_INTEGRITY */ + +static inline int blk_cpu_to_group(int cpu) +{ +#ifdef CONFIG_SCHED_MC + const struct cpumask *mask = cpu_coregroup_mask(cpu); + return cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + return first_cpu(per_cpu(cpu_sibling_map, cpu)); +#else + return cpu; +#endif +} + +static inline int blk_do_io_stat(struct request_queue *q) +{ + if (q) + return blk_queue_io_stat(q); + + return 0; +} + +#endif diff --git a/libdde-linux26/lib/src/block/genhd.c b/libdde-linux26/lib/src/block/genhd.c new file mode 100644 index 00000000..921cebff --- /dev/null +++ b/libdde-linux26/lib/src/block/genhd.c @@ -0,0 +1,1248 @@ +/* + * gendisk handling + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/kdev_t.h> +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/kobj_map.h> +#include <linux/buffer_head.h> +#include <linux/mutex.h> +#include <linux/idr.h> + +#include "blk.h" +#ifdef DDE_LINUX +#include "local.h" +#endif + +static DEFINE_MUTEX(block_class_lock); +#ifndef CONFIG_SYSFS_DEPRECATED +struct kobject *block_depr; +#endif + +/* for extended dynamic devt allocation, currently only one major is used */ +#define MAX_EXT_DEVT (1 << MINORBITS) + +/* For extended devt allocation. ext_devt_mutex prevents look up + * results from going away underneath its user. + */ +static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_IDR(ext_devt_idr); + +static struct device_type disk_type; + +/** + * disk_get_part - get partition + * @disk: disk to look partition from + * @partno: partition number + * + * Look for partition @partno from @disk. If found, increment + * reference count and return it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Pointer to the found partition on success, NULL if not found. 
+ */ +struct hd_struct *disk_get_part(struct gendisk *disk, int partno) +{ + struct hd_struct *part = NULL; + struct disk_part_tbl *ptbl; + + if (unlikely(partno < 0)) + return NULL; + + rcu_read_lock(); + + ptbl = rcu_dereference(disk->part_tbl); + if (likely(partno < ptbl->len)) { + part = rcu_dereference(ptbl->part[partno]); + if (part) + get_device(part_to_dev(part)); + } + + rcu_read_unlock(); + + return part; +} +EXPORT_SYMBOL_GPL(disk_get_part); + +/** + * disk_part_iter_init - initialize partition iterator + * @piter: iterator to initialize + * @disk: disk to iterate over + * @flags: DISK_PITER_* flags + * + * Initialize @piter so that it iterates over partitions of @disk. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, + unsigned int flags) +{ + struct disk_part_tbl *ptbl; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + piter->disk = disk; + piter->part = NULL; + + if (flags & DISK_PITER_REVERSE) + piter->idx = ptbl->len - 1; + else if (flags & DISK_PITER_INCL_PART0) + piter->idx = 0; + else + piter->idx = 1; + + piter->flags = flags; + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(disk_part_iter_init); + +/** + * disk_part_iter_next - proceed iterator to the next partition and return it + * @piter: iterator of interest + * + * Proceed @piter to the next partition and return it. + * + * CONTEXT: + * Don't care. + */ +struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +{ + struct disk_part_tbl *ptbl; + int inc, end; + + /* put the last partition */ + disk_put_part(piter->part); + piter->part = NULL; + + /* get part_tbl */ + rcu_read_lock(); + ptbl = rcu_dereference(piter->disk->part_tbl); + + /* determine iteration parameters */ + if (piter->flags & DISK_PITER_REVERSE) { + inc = -1; + if (piter->flags & DISK_PITER_INCL_PART0) + end = -1; + else + end = 0; + } else { + inc = 1; + end = ptbl->len; + } + + /* iterate to the next partition */ + for (; piter->idx != end; piter->idx += inc) { + struct hd_struct *part; + + part = rcu_dereference(ptbl->part[piter->idx]); + if (!part) + continue; + if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) + continue; + + get_device(part_to_dev(part)); + piter->part = part; + piter->idx += inc; + break; + } + + rcu_read_unlock(); + + return piter->part; +} +EXPORT_SYMBOL_GPL(disk_part_iter_next); + +/** + * disk_part_iter_exit - finish up partition iteration + * @piter: iter of interest + * + * Called when iteration is over. Cleans up @piter. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_exit(struct disk_part_iter *piter) +{ + disk_put_part(piter->part); + piter->part = NULL; +} +EXPORT_SYMBOL_GPL(disk_part_iter_exit); + +static inline int sector_in_part(struct hd_struct *part, sector_t sector) +{ + return part->start_sect <= sector && + sector < part->start_sect + part->nr_sects; +} + +/** + * disk_map_sector_rcu - map sector to partition + * @disk: gendisk of interest + * @sector: sector to map + * + * Find out which partition @sector maps to on @disk. This is + * primarily used for stats accounting. + * + * CONTEXT: + * RCU read locked. The returned partition pointer is valid only + * while preemption is disabled. 
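disk_part_iter_init()/_next()/_exit() above are the standard way to walk a gendisk's partition table; each partition returned holds a device reference that the next step or the exit call drops. A small sketch with an invented helper name:

    /* Illustrative only: count populated partitions, part0 included. */
    static int my_count_parts(struct gendisk *disk)
    {
        struct disk_part_iter piter;
        struct hd_struct *part;
        int n = 0;

        disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
        while ((part = disk_part_iter_next(&piter)))
            n++;
        disk_part_iter_exit(&piter);

        return n;
    }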
+ * + * RETURNS: + * Found partition on success, part0 is returned if no partition matches + */ +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +{ + struct disk_part_tbl *ptbl; + struct hd_struct *part; + int i; + + ptbl = rcu_dereference(disk->part_tbl); + + part = rcu_dereference(ptbl->last_lookup); + if (part && sector_in_part(part, sector)) + return part; + + for (i = 1; i < ptbl->len; i++) { + part = rcu_dereference(ptbl->part[i]); + + if (part && sector_in_part(part, sector)) { + rcu_assign_pointer(ptbl->last_lookup, part); + return part; + } + } + return &disk->part0; +} +EXPORT_SYMBOL_GPL(disk_map_sector_rcu); + +/* + * Can be deleted altogether. Later. + * + */ +static struct blk_major_name { + struct blk_major_name *next; + int major; + char name[16]; +} *major_names[BLKDEV_MAJOR_HASH_SIZE]; + +/* index in the above - for now: assume no multimajor ranges */ +static inline int major_to_index(int major) +{ + return major % BLKDEV_MAJOR_HASH_SIZE; +} + +#ifdef CONFIG_PROC_FS +void blkdev_show(struct seq_file *seqf, off_t offset) +{ + struct blk_major_name *dp; + + if (offset < BLKDEV_MAJOR_HASH_SIZE) { + mutex_lock(&block_class_lock); + for (dp = major_names[offset]; dp; dp = dp->next) + seq_printf(seqf, "%3d %s\n", dp->major, dp->name); + mutex_unlock(&block_class_lock); + } +} +#endif /* CONFIG_PROC_FS */ + +/** + * register_blkdev - register a new block device + * + * @major: the requested major device number [1..255]. If @major=0, try to + * allocate any unused major number. + * @name: the name of the new block device as a zero terminated string + * + * The @name must be unique within the system. + * + * The return value depends on the @major input parameter. + * - if a major device number was requested in range [1..255] then the + * function returns zero on success, or a negative error code + * - if any unused major number was requested with @major=0 parameter + * then the return value is the allocated major number in range + * [1..255] or a negative error code otherwise + */ +int register_blkdev(unsigned int major, const char *name) +{ + struct blk_major_name **n, *p; + int index, ret = 0; + + mutex_lock(&block_class_lock); + + /* temporary */ + if (major == 0) { + for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { + if (major_names[index] == NULL) + break; + } + + if (index == 0) { + printk("register_blkdev: failed to get major for %s\n", + name); + ret = -EBUSY; + goto out; + } + major = index; + ret = major; + } + + p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); + if (p == NULL) { + ret = -ENOMEM; + goto out; + } + + p->major = major; + strlcpy(p->name, name, sizeof(p->name)); + p->next = NULL; + index = major_to_index(major); + + for (n = &major_names[index]; *n; n = &(*n)->next) { + if ((*n)->major == major) + break; + } + if (!*n) + *n = p; + else + ret = -EBUSY; + + if (ret < 0) { + printk("register_blkdev: cannot get major %d for %s\n", + major, name); + kfree(p); + } +out: + mutex_unlock(&block_class_lock); + return ret; +} + +EXPORT_SYMBOL(register_blkdev); + +void unregister_blkdev(unsigned int major, const char *name) +{ + struct blk_major_name **n; + struct blk_major_name *p = NULL; + int index = major_to_index(major); + + mutex_lock(&block_class_lock); + for (n = &major_names[index]; *n; n = &(*n)->next) + if ((*n)->major == major) + break; + if (!*n || strcmp((*n)->name, name)) { + WARN_ON(1); + } else { + p = *n; + *n = p->next; + } + mutex_unlock(&block_class_lock); + kfree(p); +} + 
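register_blkdev() above only claims a major number (and the /proc/devices entry); disks still have to be created separately with alloc_disk()/add_disk(). The typical pairing at module load and unload time, with hypothetical names:

    /* Illustrative only: grab a dynamic major and release it on unload. */
    static int mydrv_major;

    static int __init mydrv_init(void)
    {
        mydrv_major = register_blkdev(0, "mydrv");  /* 0 = allocate any free major */
        if (mydrv_major < 0)
            return mydrv_major;
        /* ... allocate gendisks, queues, etc. ... */
        return 0;
    }

    static void __exit mydrv_exit(void)
    {
        unregister_blkdev(mydrv_major, "mydrv");
    }

    module_init(mydrv_init);
    module_exit(mydrv_exit);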
+EXPORT_SYMBOL(unregister_blkdev); + +static struct kobj_map *bdev_map; + +/** + * blk_mangle_minor - scatter minor numbers apart + * @minor: minor number to mangle + * + * Scatter consecutively allocated @minor number apart if MANGLE_DEVT + * is enabled. Mangling twice gives the original value. + * + * RETURNS: + * Mangled value. + * + * CONTEXT: + * Don't care. + */ +static int blk_mangle_minor(int minor) +{ +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + int i; + + for (i = 0; i < MINORBITS / 2; i++) { + int low = minor & (1 << i); + int high = minor & (1 << (MINORBITS - 1 - i)); + int distance = MINORBITS - 1 - 2 * i; + + minor ^= low | high; /* clear both bits */ + low <<= distance; /* swap the positions */ + high >>= distance; + minor |= low | high; /* and set */ + } +#endif + return minor; +} + +/** + * blk_alloc_devt - allocate a dev_t for a partition + * @part: partition to allocate dev_t for + * @devt: out parameter for resulting dev_t + * + * Allocate a dev_t for block device. + * + * RETURNS: + * 0 on success, allocated dev_t is returned in *@devt. -errno on + * failure. + * + * CONTEXT: + * Might sleep. + */ +int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +{ + struct gendisk *disk = part_to_disk(part); + int idx, rc; + + /* in consecutive minor range? */ + if (part->partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->partno); + return 0; + } + + /* allocate ext devt */ + do { + if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) + return -ENOMEM; + rc = idr_get_new(&ext_devt_idr, part, &idx); + } while (rc == -EAGAIN); + + if (rc) + return rc; + + if (idx > MAX_EXT_DEVT) { + idr_remove(&ext_devt_idr, idx); + return -EBUSY; + } + + *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); + return 0; +} + +/** + * blk_free_devt - free a dev_t + * @devt: dev_t to free + * + * Free @devt which was allocated using blk_alloc_devt(). + * + * CONTEXT: + * Might sleep. + */ +void blk_free_devt(dev_t devt) +{ + might_sleep(); + + if (devt == MKDEV(0, 0)) + return; + + if (MAJOR(devt) == BLOCK_EXT_MAJOR) { + mutex_lock(&ext_devt_mutex); + idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); + mutex_unlock(&ext_devt_mutex); + } +} + +static char *bdevt_str(dev_t devt, char *buf) +{ + if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { + char tbuf[BDEVT_SIZE]; + snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); + snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); + } else + snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); + + return buf; +} + +/* + * Register device numbers dev..(dev+range-1) + * range must be nonzero + * The hash chain is sorted on range, so that subranges can override. 
+ */ +void blk_register_region(dev_t devt, unsigned long range, struct module *module, + struct kobject *(*probe)(dev_t, int *, void *), + int (*lock)(dev_t, void *), void *data) +{ + kobj_map(bdev_map, devt, range, module, probe, lock, data); +} + +EXPORT_SYMBOL(blk_register_region); + +void blk_unregister_region(dev_t devt, unsigned long range) +{ + kobj_unmap(bdev_map, devt, range); +} + +EXPORT_SYMBOL(blk_unregister_region); + +static struct kobject *exact_match(dev_t devt, int *partno, void *data) +{ + struct gendisk *p = data; + + return &disk_to_dev(p)->kobj; +} + +static int exact_lock(dev_t devt, void *data) +{ + struct gendisk *p = data; + + if (!get_disk(p)) + return -1; + return 0; +} + +#ifndef DDE_LINUX +/** + * add_disk - add partitioning information to kernel list + * @disk: per-device partitioning information + * + * This function registers the partitioning information in @disk + * with the kernel. + * + * FIXME: error handling + */ +void add_disk(struct gendisk *disk) +{ + struct backing_dev_info *bdi; + dev_t devt; + int retval; + + /* minors == 0 indicates to use ext devt from part0 and should + * be accompanied with EXT_DEVT flag. Make sure all + * parameters make sense. + */ + WARN_ON(disk->minors && !(disk->major || disk->first_minor)); + WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + + disk->flags |= GENHD_FL_UP; + + retval = blk_alloc_devt(&disk->part0, &devt); + if (retval) { + WARN_ON(1); + return; + } + disk_to_dev(disk)->devt = devt; + + /* ->major and ->first_minor aren't supposed to be + * dereferenced from here on, but set them just in case. + */ + disk->major = MAJOR(devt); + disk->first_minor = MINOR(devt); + + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + register_disk(disk); + blk_register_queue(disk); + + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, + "bdi"); + WARN_ON(retval); +} + +EXPORT_SYMBOL(add_disk); +EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ +#endif + +void unlink_gendisk(struct gendisk *disk) +{ + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + bdi_unregister(&disk->queue->backing_dev_info); + blk_unregister_queue(disk); + blk_unregister_region(disk_devt(disk), disk->minors); +} + +#ifndef DDE_LINUX +/** + * get_gendisk - get partitioning information for a given device + * @devt: device to get partitioning information for + * @partno: returned partition index + * + * This function gets the structure containing partitioning + * information for the given device @devt. + */ +struct gendisk *get_gendisk(dev_t devt, int *partno) +{ + struct gendisk *disk = NULL; + + if (MAJOR(devt) != BLOCK_EXT_MAJOR) { + struct kobject *kobj; + + kobj = kobj_lookup(bdev_map, devt, partno); + if (kobj) + disk = dev_to_disk(kobj_to_dev(kobj)); + } else { + struct hd_struct *part; + + mutex_lock(&ext_devt_mutex); + part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); + if (part && get_disk(part_to_disk(part))) { + *partno = part->partno; + disk = part_to_disk(part); + } + mutex_unlock(&ext_devt_mutex); + } + + return disk; +} +#endif + +/** + * bdget_disk - do bdget() by gendisk and partition number + * @disk: gendisk of interest + * @partno: partition number + * + * Find partition @partno from @disk, do bdget() on it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Resulting block_device on success, NULL on failure. 
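+ *
+ * Minimal illustrative use (names are hypothetical):
+ *
+ *	struct block_device *bdev = bdget_disk(disk, 1);
+ *	if (bdev) {
+ *		...
+ *		bdput(bdev);
+ *	}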
+ */ +struct block_device *bdget_disk(struct gendisk *disk, int partno) +{ + struct hd_struct *part; + struct block_device *bdev = NULL; + + part = disk_get_part(disk, partno); + if (part) + bdev = bdget(part_devt(part)); + disk_put_part(part); + + return bdev; +} +EXPORT_SYMBOL(bdget_disk); + +/* + * print a full list of all partitions - intended for places where the root + * filesystem can't be mounted and thus to give the victim some idea of what + * went wrong + */ +void __init printk_all_partitions(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct disk_part_iter piter; + struct hd_struct *part; + char name_buf[BDEVNAME_SIZE]; + char devt_buf[BDEVT_SIZE]; + + /* + * Don't show empty devices or things that have been + * surpressed + */ + if (get_capacity(disk) == 0 || + (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + continue; + + /* + * Note, unlike /proc/partitions, I am showing the + * numbers in hex - the same format as the root= + * option takes. + */ + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) { + bool is_part0 = part == &disk->part0; + + printk("%s%s %10llu %s", is_part0 ? "" : " ", + bdevt_str(part_devt(part), devt_buf), + (unsigned long long)part->nr_sects >> 1, + disk_name(disk, part->partno, name_buf)); + if (is_part0) { + if (disk->driverfs_dev != NULL && + disk->driverfs_dev->driver != NULL) + printk(" driver: %s\n", + disk->driverfs_dev->driver->name); + else + printk(" (driver?)\n"); + } else + printk("\n"); + } + disk_part_iter_exit(&piter); + } + class_dev_iter_exit(&iter); +} + +#ifdef CONFIG_PROC_FS +/* iterator */ +static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) +{ + loff_t skip = *pos; + struct class_dev_iter *iter; + struct device *dev; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + seqf->private = iter; + class_dev_iter_init(iter, &block_class, NULL, &disk_type); + do { + dev = class_dev_iter_next(iter); + if (!dev) + return NULL; + } while (skip--); + + return dev_to_disk(dev); +} + +static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) +{ + struct device *dev; + + (*pos)++; + dev = class_dev_iter_next(seqf->private); + if (dev) + return dev_to_disk(dev); + + return NULL; +} + +static void disk_seqf_stop(struct seq_file *seqf, void *v) +{ + struct class_dev_iter *iter = seqf->private; + + /* stop is called even after start failed :-( */ + if (iter) { + class_dev_iter_exit(iter); + kfree(iter); + } +} + +static void *show_partition_start(struct seq_file *seqf, loff_t *pos) +{ + static void *p; + + p = disk_seqf_start(seqf, pos); + if (!IS_ERR(p) && p && !*pos) + seq_puts(seqf, "major minor #blocks name\n\n"); + return p; +} + +static int show_partition(struct seq_file *seqf, void *v) +{ + struct gendisk *sgp = v; + struct disk_part_iter piter; + struct hd_struct *part; + char buf[BDEVNAME_SIZE]; + + /* Don't show non-partitionable removeable devices or empty devices */ + if (!get_capacity(sgp) || (!disk_partitionable(sgp) && + (sgp->flags & GENHD_FL_REMOVABLE))) + return 0; + if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) + return 0; + + /* show the full disk and all non-0 size partitions of it */ + disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + seq_printf(seqf, "%4d %7d %10llu %s\n", + 
MAJOR(part_devt(part)), MINOR(part_devt(part)), + (unsigned long long)part->nr_sects >> 1, + disk_name(sgp, part->partno, buf)); + disk_part_iter_exit(&piter); + + return 0; +} + +static const struct seq_operations partitions_op = { + .start = show_partition_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, + .show = show_partition +}; + +static int partitions_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &partitions_op); +} + +static const struct file_operations proc_partitions_operations = { + .open = partitions_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +static struct kobject *base_probe(dev_t devt, int *partno, void *data) +{ + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("block-major-%d", MAJOR(devt)); + return NULL; +} + +static int __init genhd_device_init(void) +{ + int error; + + block_class.dev_kobj = sysfs_dev_block_kobj; + error = class_register(&block_class); + if (unlikely(error)) + return error; + bdev_map = kobj_map_init(base_probe, &block_class_lock); + blk_dev_init(); + + register_blkdev(BLOCK_EXT_MAJOR, "blkext"); + +#ifndef CONFIG_SYSFS_DEPRECATED + /* create top-level block dir */ + block_depr = kobject_create_and_add("block", NULL); +#endif + return 0; +} + +subsys_initcall(genhd_device_init); + +static ssize_t disk_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->minors); +} + +static ssize_t disk_ext_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk_max_parts(disk)); +} + +static ssize_t disk_removable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); +} + +static ssize_t disk_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", get_disk_ro(disk) ? 
1 : 0); +} + +static ssize_t disk_capability_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%x\n", disk->flags); +} + +static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); +static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); +static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); +static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); +static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT +static struct device_attribute dev_attr_fail_timeout = + __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show, + part_timeout_store); +#endif + +static struct attribute *disk_attrs[] = { + &dev_attr_range.attr, + &dev_attr_ext_range.attr, + &dev_attr_removable.attr, + &dev_attr_ro.attr, + &dev_attr_size.attr, + &dev_attr_capability.attr, + &dev_attr_stat.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &dev_attr_fail.attr, +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT + &dev_attr_fail_timeout.attr, +#endif + NULL +}; + +static struct attribute_group disk_attr_group = { + .attrs = disk_attrs, +}; + +static struct attribute_group *disk_attr_groups[] = { + &disk_attr_group, + NULL +}; + +static void disk_free_ptbl_rcu_cb(struct rcu_head *head) +{ + struct disk_part_tbl *ptbl = + container_of(head, struct disk_part_tbl, rcu_head); + + kfree(ptbl); +} + +/** + * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way + * @disk: disk to replace part_tbl for + * @new_ptbl: new part_tbl to install + * + * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The + * original ptbl is freed using RCU callback. + * + * LOCKING: + * Matching bd_mutx locked. + */ +static void disk_replace_part_tbl(struct gendisk *disk, + struct disk_part_tbl *new_ptbl) +{ + struct disk_part_tbl *old_ptbl = disk->part_tbl; + + rcu_assign_pointer(disk->part_tbl, new_ptbl); + + if (old_ptbl) { + rcu_assign_pointer(old_ptbl->last_lookup, NULL); +#ifndef DDE_LINUX + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); +#else + disk_free_ptbl_rcu_cb(&old_ptbl->rcu_head); +#endif + } +} + +/** + * disk_expand_part_tbl - expand disk->part_tbl + * @disk: disk to expand part_tbl for + * @partno: expand such that this partno can fit in + * + * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl + * uses RCU to allow unlocked dereferencing for stats and other stuff. + * + * LOCKING: + * Matching bd_mutex locked, might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int disk_expand_part_tbl(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct disk_part_tbl *new_ptbl; + int len = old_ptbl ? 
old_ptbl->len : 0; + int target = partno + 1; + size_t size; + int i; + + /* disk_max_parts() is zero during initialization, ignore if so */ + if (disk_max_parts(disk) && target > disk_max_parts(disk)) + return -EINVAL; + + if (target <= len) + return 0; + + size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); + new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); + if (!new_ptbl) + return -ENOMEM; + + INIT_RCU_HEAD(&new_ptbl->rcu_head); + new_ptbl->len = target; + + for (i = 0; i < len; i++) + rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); + + disk_replace_part_tbl(disk, new_ptbl); + return 0; +} + +static void disk_release(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + kfree(disk->random); + disk_replace_part_tbl(disk, NULL); + free_part_stats(&disk->part0); + kfree(disk); +} +struct class block_class = { + .name = "block", +}; + +static struct device_type disk_type = { + .name = "disk", + .groups = disk_attr_groups, + .release = disk_release, +}; + +#ifdef CONFIG_PROC_FS +/* + * aggregate disk stat collector. Uses the same stats that the sysfs + * entries do, above, but makes them available through one seq_file. + * + * The output looks suspiciously like /proc/partitions with a bunch of + * extra fields. + */ +static int diskstats_show(struct seq_file *seqf, void *v) +{ + struct gendisk *gp = v; + struct disk_part_iter piter; + struct hd_struct *hd; + char buf[BDEVNAME_SIZE]; + int cpu; + + /* + if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) + seq_puts(seqf, "major minor name" + " rio rmerge rsect ruse wio wmerge " + "wsect wuse running use aveq" + "\n\n"); + */ + + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0); + while ((hd = disk_part_iter_next(&piter))) { + cpu = part_stat_lock(); + part_round_stats(cpu, hd); + part_stat_unlock(); + seq_printf(seqf, "%4d %7d %s %lu %lu %llu " + "%u %lu %lu %llu %u %u %u %u\n", + MAJOR(part_devt(hd)), MINOR(part_devt(hd)), + disk_name(gp, hd->partno, buf), + part_stat_read(hd, ios[0]), + part_stat_read(hd, merges[0]), + (unsigned long long)part_stat_read(hd, sectors[0]), + jiffies_to_msecs(part_stat_read(hd, ticks[0])), + part_stat_read(hd, ios[1]), + part_stat_read(hd, merges[1]), + (unsigned long long)part_stat_read(hd, sectors[1]), + jiffies_to_msecs(part_stat_read(hd, ticks[1])), + hd->in_flight, + jiffies_to_msecs(part_stat_read(hd, io_ticks)), + jiffies_to_msecs(part_stat_read(hd, time_in_queue)) + ); + } + disk_part_iter_exit(&piter); + + return 0; +} + +static const struct seq_operations diskstats_op = { + .start = disk_seqf_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, + .show = diskstats_show +}; + +static int diskstats_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &diskstats_op); +} + +static const struct file_operations proc_diskstats_operations = { + .open = diskstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_genhd_init(void) +{ + proc_create("diskstats", 0, NULL, &proc_diskstats_operations); + proc_create("partitions", 0, NULL, &proc_partitions_operations); + return 0; +} +module_init(proc_genhd_init); +#endif /* CONFIG_PROC_FS */ + +static void media_change_notify_thread(struct work_struct *work) +{ + struct gendisk *gd = container_of(work, struct gendisk, async_notify); + char event[] = "MEDIA_CHANGE=1"; + char *envp[] = { event, NULL }; + + /* + * set enviroment vars to indicate which event this is for + * so that user space will know to go check the media 
status. + */ + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); + put_device(gd->driverfs_dev); +} + +#if 0 +void genhd_media_change_notify(struct gendisk *disk) +{ + get_device(disk->driverfs_dev); + schedule_work(&disk->async_notify); +} +EXPORT_SYMBOL_GPL(genhd_media_change_notify); +#endif /* 0 */ + +dev_t blk_lookup_devt(const char *name, int partno) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part; + + if (strcmp(dev_name(dev), name)) + continue; + + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. + */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + break; + } + part = disk_get_part(disk, partno); + if (part) { + devt = part_devt(part); + disk_put_part(part); + break; + } + disk_put_part(part); + } + class_dev_iter_exit(&iter); + return devt; +} +EXPORT_SYMBOL(blk_lookup_devt); + +struct gendisk *alloc_disk(int minors) +{ + return alloc_disk_node(minors, -1); +} +EXPORT_SYMBOL(alloc_disk); + +struct gendisk *alloc_disk_node(int minors, int node_id) +{ + struct gendisk *disk; + + disk = kmalloc_node(sizeof(struct gendisk), + GFP_KERNEL | __GFP_ZERO, node_id); + if (disk) { + if (!init_part_stats(&disk->part0)) { + kfree(disk); + return NULL; + } + disk->node_id = node_id; + if (disk_expand_part_tbl(disk, 0)) { + free_part_stats(&disk->part0); + kfree(disk); + return NULL; + } + disk->part_tbl->part[0] = &disk->part0; + + disk->minors = minors; +#ifndef DDE_LINUX + rand_initialize_disk(disk); +#endif + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); + INIT_WORK(&disk->async_notify, + media_change_notify_thread); + } + return disk; +} +EXPORT_SYMBOL(alloc_disk_node); + +struct kobject *get_disk(struct gendisk *disk) +{ + struct module *owner; + struct kobject *kobj; + + if (!disk->fops) + return NULL; + owner = disk->fops->owner; + if (owner && !try_module_get(owner)) + return NULL; + kobj = kobject_get(&disk_to_dev(disk)->kobj); + if (kobj == NULL) { + module_put(owner); + return NULL; + } + return kobj; + +} + +EXPORT_SYMBOL(get_disk); + +void put_disk(struct gendisk *disk) +{ + if (disk) + kobject_put(&disk_to_dev(disk)->kobj); +} + +EXPORT_SYMBOL(put_disk); + +void set_device_ro(struct block_device *bdev, int flag) +{ + bdev->bd_part->policy = flag; +} + +EXPORT_SYMBOL(set_device_ro); + +void set_disk_ro(struct gendisk *disk, int flag) +{ + struct disk_part_iter piter; + struct hd_struct *part; + + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + part->policy = flag; + disk_part_iter_exit(&piter); +} + +EXPORT_SYMBOL(set_disk_ro); + +int bdev_read_only(struct block_device *bdev) +{ + if (!bdev) + return 0; + return bdev->bd_part->policy; +} + +EXPORT_SYMBOL(bdev_read_only); + +int invalidate_partition(struct gendisk *disk, int partno) +{ + int res = 0; + struct block_device *bdev = bdget_disk(disk, partno); + if (bdev) { + fsync_bdev(bdev); + res = __invalidate_device(bdev); + bdput(bdev); + } + return res; +} + +EXPORT_SYMBOL(invalidate_partition); diff --git a/libdde-linux26/lib/src/drivers/base/class.c b/libdde-linux26/lib/src/drivers/base/class.c new file mode 100644 index 00000000..1417d80b --- /dev/null +++ 
b/libdde-linux26/lib/src/drivers/base/class.c @@ -0,0 +1,505 @@ +/* + * class.c - basic device class management + * + * Copyright (c) 2002-3 Patrick Mochel + * Copyright (c) 2002-3 Open Source Development Labs + * Copyright (c) 2003-2004 Greg Kroah-Hartman + * Copyright (c) 2003-2004 IBM Corp. + * + * This file is released under the GPLv2 + * + */ + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/kdev_t.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/genhd.h> +#include <linux/mutex.h> +#include "base.h" + +#define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) + +static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct class_attribute *class_attr = to_class_attr(attr); + struct class_private *cp = to_class(kobj); + ssize_t ret = -EIO; + + if (class_attr->show) + ret = class_attr->show(cp->class, buf); + return ret; +} + +static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct class_attribute *class_attr = to_class_attr(attr); + struct class_private *cp = to_class(kobj); + ssize_t ret = -EIO; + + if (class_attr->store) + ret = class_attr->store(cp->class, buf, count); + return ret; +} + +static void class_release(struct kobject *kobj) +{ + struct class_private *cp = to_class(kobj); + struct class *class = cp->class; + + pr_debug("class '%s': release.\n", class->name); + + if (class->class_release) + class->class_release(class); + else + pr_debug("class '%s' does not have a release() function, " + "be careful\n", class->name); +} + +static struct sysfs_ops class_sysfs_ops = { + .show = class_attr_show, + .store = class_attr_store, +}; + +static struct kobj_type class_ktype = { + .sysfs_ops = &class_sysfs_ops, + .release = class_release, +}; + +/* Hotplug events for classes go to the class class_subsys */ +static struct kset *class_kset; + + +int class_create_file(struct class *cls, const struct class_attribute *attr) +{ + int error; + if (cls) + error = sysfs_create_file(&cls->p->class_subsys.kobj, + &attr->attr); + else + error = -EINVAL; + return error; +} + +void class_remove_file(struct class *cls, const struct class_attribute *attr) +{ + if (cls) + sysfs_remove_file(&cls->p->class_subsys.kobj, &attr->attr); +} + +static struct class *class_get(struct class *cls) +{ + if (cls) + kset_get(&cls->p->class_subsys); + return cls; +} + +static void class_put(struct class *cls) +{ + if (cls) + kset_put(&cls->p->class_subsys); +} + +static int add_class_attrs(struct class *cls) +{ + int i; + int error = 0; + + if (cls->class_attrs) { + for (i = 0; attr_name(cls->class_attrs[i]); i++) { + error = class_create_file(cls, &cls->class_attrs[i]); + if (error) + goto error; + } + } +done: + return error; +error: + while (--i >= 0) + class_remove_file(cls, &cls->class_attrs[i]); + goto done; +} + +static void remove_class_attrs(struct class *cls) +{ + int i; + + if (cls->class_attrs) { + for (i = 0; attr_name(cls->class_attrs[i]); i++) + class_remove_file(cls, &cls->class_attrs[i]); + } +} + +static void klist_class_dev_get(struct klist_node *n) +{ + struct device *dev = container_of(n, struct device, knode_class); + + get_device(dev); +} + +static void klist_class_dev_put(struct klist_node *n) +{ + struct device *dev = container_of(n, struct device, knode_class); + + put_device(dev); +} + +int __class_register(struct class *cls, struct lock_class_key *key) +{ + struct 
class_private *cp; + int error; + + pr_debug("device class '%s': registering\n", cls->name); + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + klist_init(&cp->class_devices, klist_class_dev_get, klist_class_dev_put); + INIT_LIST_HEAD(&cp->class_interfaces); + kset_init(&cp->class_dirs); + __mutex_init(&cp->class_mutex, "struct class mutex", key); + error = kobject_set_name(&cp->class_subsys.kobj, "%s", cls->name); + if (error) { + kfree(cp); + return error; + } + + /* set the default /sys/dev directory for devices of this class */ + if (!cls->dev_kobj) + cls->dev_kobj = sysfs_dev_char_kobj; + +#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) && !defined(DDE_LINUX) + /* let the block class directory show up in the root of sysfs */ + if (cls != &block_class) + cp->class_subsys.kobj.kset = class_kset; +#else + cp->class_subsys.kobj.kset = class_kset; +#endif + cp->class_subsys.kobj.ktype = &class_ktype; + cp->class = cls; + cls->p = cp; + + error = kset_register(&cp->class_subsys); + if (error) { + kfree(cp); + return error; + } + error = add_class_attrs(class_get(cls)); + class_put(cls); + return error; +} +EXPORT_SYMBOL_GPL(__class_register); + +void class_unregister(struct class *cls) +{ + pr_debug("device class '%s': unregistering\n", cls->name); + remove_class_attrs(cls); + kset_unregister(&cls->p->class_subsys); +} + +static void class_create_release(struct class *cls) +{ + pr_debug("%s called for %s\n", __func__, cls->name); + kfree(cls); +} + +/** + * class_create - create a struct class structure + * @owner: pointer to the module that is to "own" this struct class + * @name: pointer to a string for the name of this class. + * @key: the lock_class_key for this class; used by mutex lock debugging + * + * This is used to create a struct class pointer that can then be used + * in calls to device_create(). + * + * Note, the pointer created here is to be destroyed when finished by + * making a call to class_destroy(). + */ +struct class *__class_create(struct module *owner, const char *name, + struct lock_class_key *key) +{ + struct class *cls; + int retval; + + cls = kzalloc(sizeof(*cls), GFP_KERNEL); + if (!cls) { + retval = -ENOMEM; + goto error; + } + + cls->name = name; + cls->owner = owner; + cls->class_release = class_create_release; + + retval = __class_register(cls, key); + if (retval) + goto error; + + return cls; + +error: + kfree(cls); + return ERR_PTR(retval); +} +EXPORT_SYMBOL_GPL(__class_create); + +/** + * class_destroy - destroys a struct class structure + * @cls: pointer to the struct class that is to be destroyed + * + * Note, the pointer to be destroyed must have been created with a call + * to class_create(). 
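+ *
+ * Illustrative pairing with class_create() (hypothetical "mydrv" class):
+ *
+ *	struct class *cls = class_create(THIS_MODULE, "mydrv");
+ *	if (IS_ERR(cls))
+ *		return PTR_ERR(cls);
+ *	...
+ *	class_destroy(cls);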
+ */ +void class_destroy(struct class *cls) +{ + if ((cls == NULL) || (IS_ERR(cls))) + return; + + class_unregister(cls); +} + +#ifdef CONFIG_SYSFS_DEPRECATED +char *make_class_name(const char *name, struct kobject *kobj) +{ + char *class_name; + int size; + + size = strlen(name) + strlen(kobject_name(kobj)) + 2; + + class_name = kmalloc(size, GFP_KERNEL); + if (!class_name) + return NULL; + + strcpy(class_name, name); + strcat(class_name, ":"); + strcat(class_name, kobject_name(kobj)); + return class_name; +} +#endif + +/** + * class_dev_iter_init - initialize class device iterator + * @iter: class iterator to initialize + * @class: the class we wanna iterate over + * @start: the device to start iterating from, if any + * @type: device_type of the devices to iterate over, NULL for all + * + * Initialize class iterator @iter such that it iterates over devices + * of @class. If @start is set, the list iteration will start there, + * otherwise if it is NULL, the iteration starts at the beginning of + * the list. + */ +void class_dev_iter_init(struct class_dev_iter *iter, struct class *class, + struct device *start, const struct device_type *type) +{ + struct klist_node *start_knode = NULL; + + if (start) + start_knode = &start->knode_class; + klist_iter_init_node(&class->p->class_devices, &iter->ki, start_knode); + iter->type = type; +} +EXPORT_SYMBOL_GPL(class_dev_iter_init); + +/** + * class_dev_iter_next - iterate to the next device + * @iter: class iterator to proceed + * + * Proceed @iter to the next device and return it. Returns NULL if + * iteration is complete. + * + * The returned device is referenced and won't be released till + * iterator is proceed to the next device or exited. The caller is + * free to do whatever it wants to do with the device including + * calling back into class code. + */ +struct device *class_dev_iter_next(struct class_dev_iter *iter) +{ + struct klist_node *knode; + struct device *dev; + + while (1) { + knode = klist_next(&iter->ki); + if (!knode) + return NULL; + dev = container_of(knode, struct device, knode_class); + if (!iter->type || iter->type == dev->type) + return dev; + } +} +EXPORT_SYMBOL_GPL(class_dev_iter_next); + +/** + * class_dev_iter_exit - finish iteration + * @iter: class iterator to finish + * + * Finish an iteration. Always call this function after iteration is + * complete whether the iteration ran till the end or not. + */ +void class_dev_iter_exit(struct class_dev_iter *iter) +{ + klist_iter_exit(&iter->ki); +} +EXPORT_SYMBOL_GPL(class_dev_iter_exit); + +/** + * class_for_each_device - device iterator + * @class: the class we're iterating + * @start: the device to start with in the list, if any. + * @data: data for the callback + * @fn: function to be called for each device + * + * Iterate over @class's list of devices, and call @fn for each, + * passing it @data. If @start is set, the list iteration will start + * there, otherwise if it is NULL, the iteration starts at the + * beginning of the list. + * + * We check the return of @fn each time. If it returns anything + * other than 0, we break out and return that value. + * + * @fn is allowed to do anything including calling back into class + * code. There's no locking restriction. 
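+ *
+ * Illustrative sketch of a callback (hypothetical names, not from this file):
+ *
+ *	static int mydrv_count_one(struct device *dev, void *data)
+ *	{
+ *		(*(int *)data)++;
+ *		return 0;
+ *	}
+ *
+ *	int count = 0;
+ *	class_for_each_device(&mydrv_class, NULL, &count, mydrv_count_one);
+ *
+ * Returning non-zero from the callback stops the iteration and that value
+ * is handed back to the caller.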
+ */ +int class_for_each_device(struct class *class, struct device *start, + void *data, int (*fn)(struct device *, void *)) +{ + struct class_dev_iter iter; + struct device *dev; + int error = 0; + + if (!class) + return -EINVAL; + if (!class->p) { + WARN(1, "%s called for class '%s' before it was initialized", + __func__, class->name); + return -EINVAL; + } + + class_dev_iter_init(&iter, class, start, NULL); + while ((dev = class_dev_iter_next(&iter))) { + error = fn(dev, data); + if (error) + break; + } + class_dev_iter_exit(&iter); + + return error; +} +EXPORT_SYMBOL_GPL(class_for_each_device); + +/** + * class_find_device - device iterator for locating a particular device + * @class: the class we're iterating + * @start: Device to begin with + * @data: data for the match function + * @match: function to check device + * + * This is similar to the class_for_each_dev() function above, but it + * returns a reference to a device that is 'found' for later use, as + * determined by the @match callback. + * + * The callback should return 0 if the device doesn't match and non-zero + * if it does. If the callback returns non-zero, this function will + * return to the caller and not iterate over any more devices. + * + * Note, you will need to drop the reference with put_device() after use. + * + * @fn is allowed to do anything including calling back into class + * code. There's no locking restriction. + */ +struct device *class_find_device(struct class *class, struct device *start, + void *data, + int (*match)(struct device *, void *)) +{ + struct class_dev_iter iter; + struct device *dev; + + if (!class) + return NULL; + if (!class->p) { + WARN(1, "%s called for class '%s' before it was initialized", + __func__, class->name); + return NULL; + } + + class_dev_iter_init(&iter, class, start, NULL); + while ((dev = class_dev_iter_next(&iter))) { + if (match(dev, data)) { + get_device(dev); + break; + } + } + class_dev_iter_exit(&iter); + + return dev; +} +EXPORT_SYMBOL_GPL(class_find_device); + +int class_interface_register(struct class_interface *class_intf) +{ + struct class *parent; + struct class_dev_iter iter; + struct device *dev; + + if (!class_intf || !class_intf->class) + return -ENODEV; + + parent = class_get(class_intf->class); + if (!parent) + return -EINVAL; + + mutex_lock(&parent->p->class_mutex); + list_add_tail(&class_intf->node, &parent->p->class_interfaces); + if (class_intf->add_dev) { + class_dev_iter_init(&iter, parent, NULL, NULL); + while ((dev = class_dev_iter_next(&iter))) + class_intf->add_dev(dev, class_intf); + class_dev_iter_exit(&iter); + } + mutex_unlock(&parent->p->class_mutex); + + return 0; +} + +void class_interface_unregister(struct class_interface *class_intf) +{ + struct class *parent = class_intf->class; + struct class_dev_iter iter; + struct device *dev; + + if (!parent) + return; + + mutex_lock(&parent->p->class_mutex); + list_del_init(&class_intf->node); + if (class_intf->remove_dev) { + class_dev_iter_init(&iter, parent, NULL, NULL); + while ((dev = class_dev_iter_next(&iter))) + class_intf->remove_dev(dev, class_intf); + class_dev_iter_exit(&iter); + } + mutex_unlock(&parent->p->class_mutex); + + class_put(parent); +} + +int __init classes_init(void) +{ + class_kset = kset_create_and_add("class", NULL, NULL); + if (!class_kset) + return -ENOMEM; + return 0; +} + +EXPORT_SYMBOL_GPL(class_create_file); +EXPORT_SYMBOL_GPL(class_remove_file); +EXPORT_SYMBOL_GPL(class_unregister); +EXPORT_SYMBOL_GPL(class_destroy); + 
+EXPORT_SYMBOL_GPL(class_interface_register); +EXPORT_SYMBOL_GPL(class_interface_unregister); diff --git a/libdde-linux26/lib/src/drivers/base/core.c b/libdde-linux26/lib/src/drivers/base/core.c new file mode 100644 index 00000000..e3800714 --- /dev/null +++ b/libdde-linux26/lib/src/drivers/base/core.c @@ -0,0 +1,1633 @@ +/* + * drivers/base/core.c - core driver model code (device registration, etc) + * + * Copyright (c) 2002-3 Patrick Mochel + * Copyright (c) 2002-3 Open Source Development Labs + * Copyright (c) 2006 Greg Kroah-Hartman <gregkh@suse.de> + * Copyright (c) 2006 Novell, Inc. + * + * This file is released under the GPLv2 + * + */ + +#include <linux/device.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kdev_t.h> +#include <linux/notifier.h> +#include <linux/genhd.h> +#include <linux/kallsyms.h> +#include <linux/semaphore.h> +#include <linux/mutex.h> + +#include "base.h" +#include "power/power.h" + +int (*platform_notify)(struct device *dev) = NULL; +int (*platform_notify_remove)(struct device *dev) = NULL; +static struct kobject *dev_kobj; +struct kobject *sysfs_dev_char_kobj; +struct kobject *sysfs_dev_block_kobj; + +#ifdef DDE_LINUX +#include "local.h" +#endif + +#if defined(CONFIG_BLOCK) && !defined(DDE_LINUX) +static inline int device_is_not_partition(struct device *dev) +{ + return !(dev->type == &part_type); +} +#else +static inline int device_is_not_partition(struct device *dev) +{ + return 1; +} +#endif + +/** + * dev_driver_string - Return a device's driver name, if at all possible + * @dev: struct device to get the name of + * + * Will return the device's driver's name if it is bound to a device. If + * the device is not bound to a device, it will return the name of the bus + * it is attached to. If it is not attached to a bus either, an empty + * string will be returned. + */ +const char *dev_driver_string(const struct device *dev) +{ + return dev->driver ? dev->driver->name : + (dev->bus ? dev->bus->name : + (dev->class ? dev->class->name : "")); +} +EXPORT_SYMBOL(dev_driver_string); + +#define to_dev(obj) container_of(obj, struct device, kobj) +#define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) + +static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct device_attribute *dev_attr = to_dev_attr(attr); + struct device *dev = to_dev(kobj); + ssize_t ret = -EIO; + + if (dev_attr->show) + ret = dev_attr->show(dev, dev_attr, buf); + if (ret >= (ssize_t)PAGE_SIZE) { + print_symbol("dev_attr_show: %s returned bad count\n", + (unsigned long)dev_attr->show); + } + return ret; +} + +static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct device_attribute *dev_attr = to_dev_attr(attr); + struct device *dev = to_dev(kobj); + ssize_t ret = -EIO; + + if (dev_attr->store) + ret = dev_attr->store(dev, dev_attr, buf, count); + return ret; +} + +static struct sysfs_ops dev_sysfs_ops = { + .show = dev_attr_show, + .store = dev_attr_store, +}; + + +/** + * device_release - free device structure. + * @kobj: device's kobject. + * + * This is called once the reference count for the object + * reaches 0. We forward the call to the device's release + * method, which should handle actually freeing the structure. 
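+ *
+ * A driver that embeds a struct device usually frees the containing object
+ * from its release callback; an illustrative sketch (hypothetical
+ * struct mydrv_device):
+ *
+ *	static void mydrv_release(struct device *dev)
+ *	{
+ *		kfree(container_of(dev, struct mydrv_device, dev));
+ *	}
+ *	...
+ *	mydev->dev.release = mydrv_release;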
+ */ +static void device_release(struct kobject *kobj) +{ + struct device *dev = to_dev(kobj); + + if (dev->release) + dev->release(dev); + else if (dev->type && dev->type->release) + dev->type->release(dev); + else if (dev->class && dev->class->dev_release) + dev->class->dev_release(dev); + else + WARN(1, KERN_ERR "Device '%s' does not have a release() " + "function, it is broken and must be fixed.\n", + dev_name(dev)); +} + +static struct kobj_type device_ktype = { + .release = device_release, + .sysfs_ops = &dev_sysfs_ops, +}; + + +static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &device_ktype) { + struct device *dev = to_dev(kobj); + if (dev->uevent_suppress) + return 0; + if (dev->bus) + return 1; + if (dev->class) + return 1; + } + return 0; +} + +static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj) +{ + struct device *dev = to_dev(kobj); + + if (dev->bus) + return dev->bus->name; + if (dev->class) + return dev->class->name; + return NULL; +} + +static int dev_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct device *dev = to_dev(kobj); + int retval = 0; + +#ifndef DDE_LINUX + /* add the major/minor if present */ + if (MAJOR(dev->devt)) { + add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt)); + add_uevent_var(env, "MINOR=%u", MINOR(dev->devt)); + } + + if (dev->type && dev->type->name) + add_uevent_var(env, "DEVTYPE=%s", dev->type->name); + + if (dev->driver) + add_uevent_var(env, "DRIVER=%s", dev->driver->name); + +#ifdef CONFIG_SYSFS_DEPRECATED + if (dev->class) { + struct device *parent = dev->parent; + + /* find first bus device in parent chain */ + while (parent && !parent->bus) + parent = parent->parent; + if (parent && parent->bus) { + const char *path; + + path = kobject_get_path(&parent->kobj, GFP_KERNEL); + if (path) { + add_uevent_var(env, "PHYSDEVPATH=%s", path); + kfree(path); + } + + add_uevent_var(env, "PHYSDEVBUS=%s", parent->bus->name); + + if (parent->driver) + add_uevent_var(env, "PHYSDEVDRIVER=%s", + parent->driver->name); + } + } else if (dev->bus) { + add_uevent_var(env, "PHYSDEVBUS=%s", dev->bus->name); + + if (dev->driver) + add_uevent_var(env, "PHYSDEVDRIVER=%s", + dev->driver->name); + } +#endif + + /* have the bus specific function add its stuff */ + if (dev->bus && dev->bus->uevent) { + retval = dev->bus->uevent(dev, env); + if (retval) + pr_debug("device: '%s': %s: bus uevent() returned %d\n", + dev_name(dev), __func__, retval); + } + + /* have the class specific function add its stuff */ + if (dev->class && dev->class->dev_uevent) { + retval = dev->class->dev_uevent(dev, env); + if (retval) + pr_debug("device: '%s': %s: class uevent() " + "returned %d\n", dev_name(dev), + __func__, retval); + } + + /* have the device type specific fuction add its stuff */ + if (dev->type && dev->type->uevent) { + retval = dev->type->uevent(dev, env); + if (retval) + pr_debug("device: '%s': %s: dev_type uevent() " + "returned %d\n", dev_name(dev), + __func__, retval); + } +#endif + + return retval; +} + +static struct kset_uevent_ops device_uevent_ops = { + .filter = dev_uevent_filter, + .name = dev_uevent_name, + .uevent = dev_uevent, +}; + +static ssize_t show_uevent(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct kobject *top_kobj; + struct kset *kset; + struct kobj_uevent_env *env = NULL; + int i; + size_t count = 0; + int retval; + + /* search the kset, the device belongs to */ + top_kobj = 
&dev->kobj; + while (!top_kobj->kset && top_kobj->parent) + top_kobj = top_kobj->parent; + if (!top_kobj->kset) + goto out; + + kset = top_kobj->kset; + if (!kset->uevent_ops || !kset->uevent_ops->uevent) + goto out; + + /* respect filter */ + if (kset->uevent_ops && kset->uevent_ops->filter) + if (!kset->uevent_ops->filter(kset, &dev->kobj)) + goto out; + + env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + /* let the kset specific function add its keys */ + retval = kset->uevent_ops->uevent(kset, &dev->kobj, env); + if (retval) + goto out; + + /* copy keys to file */ + for (i = 0; i < env->envp_idx; i++) + count += sprintf(&buf[count], "%s\n", env->envp[i]); +out: + kfree(env); + return count; +} + +static ssize_t store_uevent(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + enum kobject_action action; + + if (kobject_action_type(buf, count, &action) == 0) { + kobject_uevent(&dev->kobj, action); + goto out; + } + + dev_err(dev, "uevent: unsupported action-string; this will " + "be ignored in a future kernel version\n"); + kobject_uevent(&dev->kobj, KOBJ_ADD); +out: + return count; +} + +static struct device_attribute uevent_attr = + __ATTR(uevent, S_IRUGO | S_IWUSR, show_uevent, store_uevent); + +static int device_add_attributes(struct device *dev, + struct device_attribute *attrs) +{ + int error = 0; + int i; + + if (attrs) { + for (i = 0; attr_name(attrs[i]); i++) { + error = device_create_file(dev, &attrs[i]); + if (error) + break; + } + if (error) + while (--i >= 0) + device_remove_file(dev, &attrs[i]); + } + return error; +} + +static void device_remove_attributes(struct device *dev, + struct device_attribute *attrs) +{ + int i; + + if (attrs) + for (i = 0; attr_name(attrs[i]); i++) + device_remove_file(dev, &attrs[i]); +} + +static int device_add_groups(struct device *dev, + struct attribute_group **groups) +{ + int error = 0; + int i; + + if (groups) { + for (i = 0; groups[i]; i++) { + error = sysfs_create_group(&dev->kobj, groups[i]); + if (error) { + while (--i >= 0) + sysfs_remove_group(&dev->kobj, + groups[i]); + break; + } + } + } + return error; +} + +static void device_remove_groups(struct device *dev, + struct attribute_group **groups) +{ + int i; + + if (groups) + for (i = 0; groups[i]; i++) + sysfs_remove_group(&dev->kobj, groups[i]); +} + +static int device_add_attrs(struct device *dev) +{ + struct class *class = dev->class; + struct device_type *type = dev->type; + int error; + + if (class) { + error = device_add_attributes(dev, class->dev_attrs); + if (error) + return error; + } + + if (type) { + error = device_add_groups(dev, type->groups); + if (error) + goto err_remove_class_attrs; + } + + error = device_add_groups(dev, dev->groups); + if (error) + goto err_remove_type_groups; + + return 0; + + err_remove_type_groups: + if (type) + device_remove_groups(dev, type->groups); + err_remove_class_attrs: + if (class) + device_remove_attributes(dev, class->dev_attrs); + + return error; +} + +static void device_remove_attrs(struct device *dev) +{ + struct class *class = dev->class; + struct device_type *type = dev->type; + + device_remove_groups(dev, dev->groups); + + if (type) + device_remove_groups(dev, type->groups); + + if (class) + device_remove_attributes(dev, class->dev_attrs); +} + + +static ssize_t show_dev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return print_dev_t(buf, dev->devt); +} + +static struct device_attribute devt_attr = + __ATTR(dev, S_IRUGO, 
show_dev, NULL); + +/* kset to create /sys/devices/ */ +struct kset *devices_kset; + +/** + * device_create_file - create sysfs attribute file for device. + * @dev: device. + * @attr: device attribute descriptor. + */ +int device_create_file(struct device *dev, struct device_attribute *attr) +{ + int error = 0; + if (dev) + error = sysfs_create_file(&dev->kobj, &attr->attr); + return error; +} + +/** + * device_remove_file - remove sysfs attribute file. + * @dev: device. + * @attr: device attribute descriptor. + */ +void device_remove_file(struct device *dev, struct device_attribute *attr) +{ + if (dev) + sysfs_remove_file(&dev->kobj, &attr->attr); +} + +/** + * device_create_bin_file - create sysfs binary attribute file for device. + * @dev: device. + * @attr: device binary attribute descriptor. + */ +int device_create_bin_file(struct device *dev, struct bin_attribute *attr) +{ + int error = -EINVAL; + if (dev) + error = sysfs_create_bin_file(&dev->kobj, attr); + return error; +} +EXPORT_SYMBOL_GPL(device_create_bin_file); + +/** + * device_remove_bin_file - remove sysfs binary attribute file + * @dev: device. + * @attr: device binary attribute descriptor. + */ +void device_remove_bin_file(struct device *dev, struct bin_attribute *attr) +{ + if (dev) + sysfs_remove_bin_file(&dev->kobj, attr); +} +EXPORT_SYMBOL_GPL(device_remove_bin_file); + +/** + * device_schedule_callback_owner - helper to schedule a callback for a device + * @dev: device. + * @func: callback function to invoke later. + * @owner: module owning the callback routine + * + * Attribute methods must not unregister themselves or their parent device + * (which would amount to the same thing). Attempts to do so will deadlock, + * since unregistration is mutually exclusive with driver callbacks. + * + * Instead methods can call this routine, which will attempt to allocate + * and schedule a workqueue request to call back @func with @dev as its + * argument in the workqueue's process context. @dev will be pinned until + * @func returns. + * + * This routine is usually called via the inline device_schedule_callback(), + * which automatically sets @owner to THIS_MODULE. + * + * Returns 0 if the request was submitted, -ENOMEM if storage could not + * be allocated, -ENODEV if a reference to @owner isn't available. + * + * NOTE: This routine won't work if CONFIG_SYSFS isn't set! It uses an + * underlying sysfs routine (since it is intended for use by attribute + * methods), and if sysfs isn't available you'll get nothing but -ENOSYS. + */ +int device_schedule_callback_owner(struct device *dev, + void (*func)(struct device *), struct module *owner) +{ + return sysfs_schedule_callback(&dev->kobj, + (void (*)(void *)) func, dev, owner); +} +EXPORT_SYMBOL_GPL(device_schedule_callback_owner); + +static void klist_children_get(struct klist_node *n) +{ + struct device *dev = container_of(n, struct device, knode_parent); + + get_device(dev); +} + +static void klist_children_put(struct klist_node *n) +{ + struct device *dev = container_of(n, struct device, knode_parent); + + put_device(dev); +} + +/** + * device_initialize - init device structure. + * @dev: device. + * + * This prepares the device for use by other layers by initializing + * its fields. + * It is the first half of device_register(), if called by + * that function, though it can also be called separately, so one + * may use @dev's fields. In particular, get_device()/put_device() + * may be used for reference counting of @dev after calling this + * function. 
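+ *
+ * Illustrative sketch of the split form (hypothetical "mydev" object):
+ *
+ *	device_initialize(&mydev->dev);
+ *	...	(get_device()/put_device() are valid from here on)
+ *	error = device_add(&mydev->dev);
+ *	if (error)
+ *		put_device(&mydev->dev);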
+ * + * NOTE: Use put_device() to give up your reference instead of freeing + * @dev directly once you have called this function. + */ +void device_initialize(struct device *dev) +{ + dev->kobj.kset = devices_kset; + kobject_init(&dev->kobj, &device_ktype); + klist_init(&dev->klist_children, klist_children_get, + klist_children_put); + INIT_LIST_HEAD(&dev->dma_pools); + init_MUTEX(&dev->sem); + spin_lock_init(&dev->devres_lock); + INIT_LIST_HEAD(&dev->devres_head); + device_init_wakeup(dev, 0); + device_pm_init(dev); + set_dev_node(dev, -1); +} + +#ifdef CONFIG_SYSFS_DEPRECATED +static struct kobject *get_device_parent(struct device *dev, + struct device *parent) +{ + /* class devices without a parent live in /sys/class/<classname>/ */ + if (dev->class && (!parent || parent->class != dev->class)) + return &dev->class->p->class_subsys.kobj; + /* all other devices keep their parent */ + else if (parent) + return &parent->kobj; + + return NULL; +} + +static inline void cleanup_device_parent(struct device *dev) {} +static inline void cleanup_glue_dir(struct device *dev, + struct kobject *glue_dir) {} +#else +static struct kobject *virtual_device_parent(struct device *dev) +{ + static struct kobject *virtual_dir = NULL; + + if (!virtual_dir) + virtual_dir = kobject_create_and_add("virtual", + &devices_kset->kobj); + + return virtual_dir; +} + +static struct kobject *get_device_parent(struct device *dev, + struct device *parent) +{ + int retval; + + if (dev->class) { + struct kobject *kobj = NULL; + struct kobject *parent_kobj; + struct kobject *k; + + /* + * If we have no parent, we live in "virtual". + * Class-devices with a non class-device as parent, live + * in a "glue" directory to prevent namespace collisions. + */ + if (parent == NULL) + parent_kobj = virtual_device_parent(dev); + else if (parent->class) + return &parent->kobj; + else + parent_kobj = &parent->kobj; + + /* find our class-directory at the parent and reference it */ + spin_lock(&dev->class->p->class_dirs.list_lock); + list_for_each_entry(k, &dev->class->p->class_dirs.list, entry) + if (k->parent == parent_kobj) { + kobj = kobject_get(k); + break; + } + spin_unlock(&dev->class->p->class_dirs.list_lock); + if (kobj) + return kobj; + + /* or create a new class-directory at the parent device */ + k = kobject_create(); + if (!k) + return NULL; + k->kset = &dev->class->p->class_dirs; + retval = kobject_add(k, parent_kobj, "%s", dev->class->name); + if (retval < 0) { + kobject_put(k); + return NULL; + } + /* do not emit an uevent for this simple "glue" directory */ + return k; + } + + if (parent) + return &parent->kobj; + return NULL; +} + +static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir) +{ + /* see if we live in a "glue" directory */ + if (!glue_dir || !dev->class || + glue_dir->kset != &dev->class->p->class_dirs) + return; + + kobject_put(glue_dir); +} + +static void cleanup_device_parent(struct device *dev) +{ + cleanup_glue_dir(dev, dev->kobj.parent); +} +#endif + +static void setup_parent(struct device *dev, struct device *parent) +{ + struct kobject *kobj; + kobj = get_device_parent(dev, parent); + if (kobj) + dev->kobj.parent = kobj; +} + +static int device_add_class_symlinks(struct device *dev) +{ + int error; + + if (!dev->class) + return 0; + + error = sysfs_create_link(&dev->kobj, + &dev->class->p->class_subsys.kobj, + "subsystem"); + if (error) + goto out; + +#ifdef CONFIG_SYSFS_DEPRECATED + /* stacked class devices need a symlink in the class directory */ + if (dev->kobj.parent != 
&dev->class->p->class_subsys.kobj && + device_is_not_partition(dev)) { + error = sysfs_create_link(&dev->class->p->class_subsys.kobj, + &dev->kobj, dev_name(dev)); + if (error) + goto out_subsys; + } + + if (dev->parent && device_is_not_partition(dev)) { + struct device *parent = dev->parent; + char *class_name; + + /* + * stacked class devices have the 'device' link + * pointing to the bus device instead of the parent + */ + while (parent->class && !parent->bus && parent->parent) + parent = parent->parent; + + error = sysfs_create_link(&dev->kobj, + &parent->kobj, + "device"); + if (error) + goto out_busid; + + class_name = make_class_name(dev->class->name, + &dev->kobj); + if (class_name) + error = sysfs_create_link(&dev->parent->kobj, + &dev->kobj, class_name); + kfree(class_name); + if (error) + goto out_device; + } + return 0; + +out_device: + if (dev->parent && device_is_not_partition(dev)) + sysfs_remove_link(&dev->kobj, "device"); +out_busid: + if (dev->kobj.parent != &dev->class->p->class_subsys.kobj && + device_is_not_partition(dev)) + sysfs_remove_link(&dev->class->p->class_subsys.kobj, + dev_name(dev)); +#else + /* link in the class directory pointing to the device */ + error = sysfs_create_link(&dev->class->p->class_subsys.kobj, + &dev->kobj, dev_name(dev)); + if (error) + goto out_subsys; + + if (dev->parent && device_is_not_partition(dev)) { + error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, + "device"); + if (error) + goto out_busid; + } + return 0; + +out_busid: + sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); +#endif + +out_subsys: + sysfs_remove_link(&dev->kobj, "subsystem"); +out: + return error; +} + +static void device_remove_class_symlinks(struct device *dev) +{ + if (!dev->class) + return; + +#ifdef CONFIG_SYSFS_DEPRECATED + if (dev->parent && device_is_not_partition(dev)) { + char *class_name; + + class_name = make_class_name(dev->class->name, &dev->kobj); + if (class_name) { + sysfs_remove_link(&dev->parent->kobj, class_name); + kfree(class_name); + } + sysfs_remove_link(&dev->kobj, "device"); + } + + if (dev->kobj.parent != &dev->class->p->class_subsys.kobj && + device_is_not_partition(dev)) + sysfs_remove_link(&dev->class->p->class_subsys.kobj, + dev_name(dev)); +#else + if (dev->parent && device_is_not_partition(dev)) + sysfs_remove_link(&dev->kobj, "device"); + + sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); +#endif + + sysfs_remove_link(&dev->kobj, "subsystem"); +} + +/** + * dev_set_name - set a device name + * @dev: device + * @fmt: format string for the device's name + */ +int dev_set_name(struct device *dev, const char *fmt, ...) +{ + va_list vargs; + char *s; + + va_start(vargs, fmt); + vsnprintf(dev->bus_id, sizeof(dev->bus_id), fmt, vargs); + va_end(vargs); + + /* ewww... some of these buggers have / in the name... */ + while ((s = strchr(dev->bus_id, '/'))) + *s = '!'; + + return 0; +} +EXPORT_SYMBOL_GPL(dev_set_name); + +/** + * device_to_dev_kobj - select a /sys/dev/ directory for the device + * @dev: device + * + * By default we select char/ for new entries. Setting class->dev_kobj + * to NULL prevents an entry from being created. class->dev_kobj must + * be set (or cleared) before any devices are registered to the class + * otherwise device_create_sys_dev_entry() and + * device_remove_sys_dev_entry() will disagree about the presence + * of the link.
+ */ +static struct kobject *device_to_dev_kobj(struct device *dev) +{ + struct kobject *kobj; + + if (dev->class) + kobj = dev->class->dev_kobj; + else + kobj = sysfs_dev_char_kobj; + + return kobj; +} + +static int device_create_sys_dev_entry(struct device *dev) +{ + struct kobject *kobj = device_to_dev_kobj(dev); + int error = 0; + char devt_str[15]; + + if (kobj) { + format_dev_t(devt_str, dev->devt); + error = sysfs_create_link(kobj, &dev->kobj, devt_str); + } + + return error; +} + +static void device_remove_sys_dev_entry(struct device *dev) +{ + struct kobject *kobj = device_to_dev_kobj(dev); + char devt_str[15]; + + if (kobj) { + format_dev_t(devt_str, dev->devt); + sysfs_remove_link(kobj, devt_str); + } +} + +/** + * device_add - add device to device hierarchy. + * @dev: device. + * + * This is part 2 of device_register(), though may be called + * separately _iff_ device_initialize() has been called separately. + * + * This adds @dev to the kobject hierarchy via kobject_add(), adds it + * to the global and sibling lists for the device, then + * adds it to the other relevant subsystems of the driver model. + * + * NOTE: _Never_ directly free @dev after calling this function, even + * if it returned an error! Always use put_device() to give up your + * reference instead. + */ +int device_add(struct device *dev) +{ + struct device *parent = NULL; + struct class_interface *class_intf; + int error = -EINVAL; + + dev = get_device(dev); + if (!dev) + goto done; + + /* Temporarily support init_name if it is set. + * It will override bus_id for now */ + if (dev->init_name) + dev_set_name(dev, "%s", dev->init_name); + + if (!strlen(dev->bus_id)) + goto done; + + pr_debug("device: '%s': %s\n", dev_name(dev), __func__); + + parent = get_device(dev->parent); + setup_parent(dev, parent); + + /* use parent numa_node */ + if (parent) + set_dev_node(dev, dev_to_node(parent)); + + /* first, register with generic layer. */ + error = kobject_add(&dev->kobj, dev->kobj.parent, "%s", dev_name(dev)); + if (error) + goto Error; + + /* notify platform of device entry */ + if (platform_notify) + platform_notify(dev); + + error = device_create_file(dev, &uevent_attr); + if (error) + goto attrError; + + if (MAJOR(dev->devt)) { + error = device_create_file(dev, &devt_attr); + if (error) + goto ueventattrError; + + error = device_create_sys_dev_entry(dev); + if (error) + goto devtattrError; + } + + error = device_add_class_symlinks(dev); + if (error) + goto SymlinkError; + error = device_add_attrs(dev); + if (error) + goto AttrsError; + error = bus_add_device(dev); + if (error) + goto BusError; + error = dpm_sysfs_add(dev); + if (error) + goto DPMError; + device_pm_add(dev); + + /* Notify clients of device addition. This call must come + * after dpm_sysf_add() and before kobject_uevent(). 
+ */ + if (dev->bus) + blocking_notifier_call_chain(&dev->bus->p->bus_notifier, + BUS_NOTIFY_ADD_DEVICE, dev); + + kobject_uevent(&dev->kobj, KOBJ_ADD); + bus_attach_device(dev); + if (parent) + klist_add_tail(&dev->knode_parent, &parent->klist_children); + + if (dev->class) { + mutex_lock(&dev->class->p->class_mutex); + /* tie the class to the device */ + klist_add_tail(&dev->knode_class, + &dev->class->p->class_devices); + + /* notify any interfaces that the device is here */ + list_for_each_entry(class_intf, + &dev->class->p->class_interfaces, node) + if (class_intf->add_dev) + class_intf->add_dev(dev, class_intf); + mutex_unlock(&dev->class->p->class_mutex); + } +done: + put_device(dev); + return error; + DPMError: + bus_remove_device(dev); + BusError: + device_remove_attrs(dev); + AttrsError: + device_remove_class_symlinks(dev); + SymlinkError: + if (MAJOR(dev->devt)) + device_remove_sys_dev_entry(dev); + devtattrError: + if (MAJOR(dev->devt)) + device_remove_file(dev, &devt_attr); + ueventattrError: + device_remove_file(dev, &uevent_attr); + attrError: + kobject_uevent(&dev->kobj, KOBJ_REMOVE); + kobject_del(&dev->kobj); + Error: + cleanup_device_parent(dev); + if (parent) + put_device(parent); + goto done; +} + +/** + * device_register - register a device with the system. + * @dev: pointer to the device structure + * + * This happens in two clean steps - initialize the device + * and add it to the system. The two steps can be called + * separately, but this is the easiest and most common. + * I.e. you should only call the two helpers separately if + * have a clearly defined need to use and refcount the device + * before it is added to the hierarchy. + * + * NOTE: _Never_ directly free @dev after calling this function, even + * if it returned an error! Always use put_device() to give up the + * reference initialized in this function instead. + */ +int device_register(struct device *dev) +{ + device_initialize(dev); + return device_add(dev); +} + +/** + * get_device - increment reference count for device. + * @dev: device. + * + * This simply forwards the call to kobject_get(), though + * we do take care to provide for the case that we get a NULL + * pointer passed in. + */ +struct device *get_device(struct device *dev) +{ + return dev ? to_dev(kobject_get(&dev->kobj)) : NULL; +} + +/** + * put_device - decrement reference count. + * @dev: device in question. + */ +void put_device(struct device *dev) +{ + /* might_sleep(); */ + if (dev) + kobject_put(&dev->kobj); +} + +/** + * device_del - delete device from system. + * @dev: device. + * + * This is the first part of the device unregistration + * sequence. This removes the device from the lists we control + * from here, has it removed from the other driver model + * subsystems it was added to in device_add(), and removes it + * from the kobject hierarchy. + * + * NOTE: this should be called manually _iff_ device_add() was + * also called manually. + */ +void device_del(struct device *dev) +{ + struct device *parent = dev->parent; + struct class_interface *class_intf; + + /* Notify clients of device removal. This call must come + * before dpm_sysfs_remove(). 
+ */ + if (dev->bus) + blocking_notifier_call_chain(&dev->bus->p->bus_notifier, + BUS_NOTIFY_DEL_DEVICE, dev); + device_pm_remove(dev); + dpm_sysfs_remove(dev); + if (parent) + klist_del(&dev->knode_parent); + if (MAJOR(dev->devt)) { + device_remove_sys_dev_entry(dev); + device_remove_file(dev, &devt_attr); + } + if (dev->class) { + device_remove_class_symlinks(dev); + + mutex_lock(&dev->class->p->class_mutex); + /* notify any interfaces that the device is now gone */ + list_for_each_entry(class_intf, + &dev->class->p->class_interfaces, node) + if (class_intf->remove_dev) + class_intf->remove_dev(dev, class_intf); + /* remove the device from the class list */ + klist_del(&dev->knode_class); + mutex_unlock(&dev->class->p->class_mutex); + } + device_remove_file(dev, &uevent_attr); + device_remove_attrs(dev); + bus_remove_device(dev); + + /* + * Some platform devices are driven without driver attached + * and managed resources may have been acquired. Make sure + * all resources are released. + */ + devres_release_all(dev); + + /* Notify the platform of the removal, in case they + * need to do anything... + */ + if (platform_notify_remove) + platform_notify_remove(dev); + kobject_uevent(&dev->kobj, KOBJ_REMOVE); + cleanup_device_parent(dev); + kobject_del(&dev->kobj); + put_device(parent); +} + +/** + * device_unregister - unregister device from system. + * @dev: device going away. + * + * We do this in two parts, like we do device_register(). First, + * we remove it from all the subsystems with device_del(), then + * we decrement the reference count via put_device(). If that + * is the final reference count, the device will be cleaned up + * via device_release() above. Otherwise, the structure will + * stick around until the final reference to the device is dropped. + */ +void device_unregister(struct device *dev) +{ + pr_debug("device: '%s': %s\n", dev_name(dev), __func__); + device_del(dev); + put_device(dev); +} + +static struct device *next_device(struct klist_iter *i) +{ + struct klist_node *n = klist_next(i); + return n ? container_of(n, struct device, knode_parent) : NULL; +} + +/** + * device_for_each_child - device child iterator. + * @parent: parent struct device. + * @data: data for the callback. + * @fn: function to be called for each device. + * + * Iterate over @parent's child devices, and call @fn for each, + * passing it @data. + * + * We check the return of @fn each time. If it returns anything + * other than 0, we break out and return that value. + */ +int device_for_each_child(struct device *parent, void *data, + int (*fn)(struct device *dev, void *data)) +{ + struct klist_iter i; + struct device *child; + int error = 0; + + klist_iter_init(&parent->klist_children, &i); + while ((child = next_device(&i)) && !error) + error = fn(child, data); + klist_iter_exit(&i); + return error; +} + +/** + * device_find_child - device iterator for locating a particular device. + * @parent: parent struct device + * @data: Data to pass to match function + * @match: Callback function to check device + * + * This is similar to the device_for_each_child() function above, but it + * returns a reference to a device that is 'found' for later use, as + * determined by the @match callback. + * + * The callback should return 0 if the device doesn't match and non-zero + * if it does. If the callback returns non-zero and a reference to the + * current device can be obtained, this function will return to the caller + * and not iterate over any more devices. 
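 * Editor's illustrative sketch (not part of this patch; the match
 * callback and the "leaf0" name are hypothetical).  The reference
 * taken on the matched child must be dropped with put_device():
 *
 *	static int match_name(struct device *dev, void *data)
 *	{
 *		return !strcmp(dev_name(dev), (const char *)data);
 *	}
 *
 *	child = device_find_child(parent, (void *)"leaf0", match_name);
 *	if (child) {
 *		...
 *		put_device(child);
 *	}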
+ */ +struct device *device_find_child(struct device *parent, void *data, + int (*match)(struct device *dev, void *data)) +{ + struct klist_iter i; + struct device *child; + + if (!parent) + return NULL; + + klist_iter_init(&parent->klist_children, &i); + while ((child = next_device(&i))) + if (match(child, data) && get_device(child)) + break; + klist_iter_exit(&i); + return child; +} + +int __init devices_init(void) +{ + devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); + if (!devices_kset) + return -ENOMEM; + dev_kobj = kobject_create_and_add("dev", NULL); + if (!dev_kobj) + goto dev_kobj_err; + sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj); + if (!sysfs_dev_block_kobj) + goto block_kobj_err; + sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); + if (!sysfs_dev_char_kobj) + goto char_kobj_err; + + return 0; + + char_kobj_err: + kobject_put(sysfs_dev_block_kobj); + block_kobj_err: + kobject_put(dev_kobj); + dev_kobj_err: + kset_unregister(devices_kset); + return -ENOMEM; +} + +EXPORT_SYMBOL_GPL(device_for_each_child); +EXPORT_SYMBOL_GPL(device_find_child); + +EXPORT_SYMBOL_GPL(device_initialize); +EXPORT_SYMBOL_GPL(device_add); +EXPORT_SYMBOL_GPL(device_register); + +EXPORT_SYMBOL_GPL(device_del); +EXPORT_SYMBOL_GPL(device_unregister); +EXPORT_SYMBOL_GPL(get_device); +EXPORT_SYMBOL_GPL(put_device); + +EXPORT_SYMBOL_GPL(device_create_file); +EXPORT_SYMBOL_GPL(device_remove_file); + +struct root_device +{ + struct device dev; + struct module *owner; +}; + +#define to_root_device(dev) container_of(dev, struct root_device, dev) + +static void root_device_release(struct device *dev) +{ + kfree(to_root_device(dev)); +} + +/** + * __root_device_register - allocate and register a root device + * @name: root device name + * @owner: owner module of the root device, usually THIS_MODULE + * + * This function allocates a root device and registers it + * using device_register(). In order to free the returned + * device, use root_device_unregister(). + * + * Root devices are dummy devices which allow other devices + * to be grouped under /sys/devices. Use this function to + * allocate a root device and then use it as the parent of + * any device which should appear under /sys/devices/{name} + * + * The /sys/devices/{name} directory will also contain a + * 'module' symlink which points to the @owner directory + * in sysfs. + * + * Note: You probably want to use root_device_register(). + */ +struct device *__root_device_register(const char *name, struct module *owner) +{ + struct root_device *root; + int err = -ENOMEM; + + root = kzalloc(sizeof(struct root_device), GFP_KERNEL); + if (!root) + return ERR_PTR(err); + + err = dev_set_name(&root->dev, name); + if (err) { + kfree(root); + return ERR_PTR(err); + } + + root->dev.release = root_device_release; + + err = device_register(&root->dev); + if (err) { + put_device(&root->dev); + return ERR_PTR(err); + } + +#ifdef CONFIG_MODULE /* gotta find a "cleaner" way to do this */ + if (owner) { + struct module_kobject *mk = &owner->mkobj; + + err = sysfs_create_link(&root->dev.kobj, &mk->kobj, "module"); + if (err) { + device_unregister(&root->dev); + return ERR_PTR(err); + } + root->owner = owner; + } +#endif + + return &root->dev; +} +EXPORT_SYMBOL_GPL(__root_device_register); + +/** + * root_device_unregister - unregister and free a root device + * @dev: device going away + * + * This function unregisters and cleans up a device that was created by + * root_device_register(). 
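 * Editor's illustrative sketch (not part of this patch; the "mybus"
 * name is made up).  root_device_register() is the usual wrapper
 * around __root_device_register() above:
 *
 *	struct device *root;
 *
 *	root = root_device_register("mybus");
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *	...
 *	root_device_unregister(root);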
+ */ +void root_device_unregister(struct device *dev) +{ + struct root_device *root = to_root_device(dev); + + if (root->owner) + sysfs_remove_link(&root->dev.kobj, "module"); + + device_unregister(dev); +} +EXPORT_SYMBOL_GPL(root_device_unregister); + + +static void device_create_release(struct device *dev) +{ + pr_debug("device: '%s': %s\n", dev_name(dev), __func__); + kfree(dev); +} + +/** + * device_create_vargs - creates a device and registers it with sysfs + * @class: pointer to the struct class that this device should be registered to + * @parent: pointer to the parent struct device of this new device, if any + * @devt: the dev_t for the char device to be added + * @drvdata: the data to be added to the device for callbacks + * @fmt: string for the device's name + * @args: va_list for the device's name + * + * This function can be used by char device classes. A struct device + * will be created in sysfs, registered to the specified class. + * + * A "dev" file will be created, showing the dev_t for the device, if + * the dev_t is not 0,0. + * If a pointer to a parent struct device is passed in, the newly created + * struct device will be a child of that device in sysfs. + * The pointer to the struct device will be returned from the call. + * Any further sysfs files that might be required can be created using this + * pointer. + * + * Note: the struct class passed to this function must have previously + * been created with a call to class_create(). + */ +struct device *device_create_vargs(struct class *class, struct device *parent, + dev_t devt, void *drvdata, const char *fmt, + va_list args) +{ + struct device *dev = NULL; + int retval = -ENODEV; + + if (class == NULL || IS_ERR(class)) + goto error; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + retval = -ENOMEM; + goto error; + } + + dev->devt = devt; + dev->class = class; + dev->parent = parent; + dev->release = device_create_release; + dev_set_drvdata(dev, drvdata); + + vsnprintf(dev->bus_id, BUS_ID_SIZE, fmt, args); + retval = device_register(dev); + if (retval) + goto error; + + return dev; + +error: + put_device(dev); + return ERR_PTR(retval); +} +EXPORT_SYMBOL_GPL(device_create_vargs); + +/** + * device_create - creates a device and registers it with sysfs + * @class: pointer to the struct class that this device should be registered to + * @parent: pointer to the parent struct device of this new device, if any + * @devt: the dev_t for the char device to be added + * @drvdata: the data to be added to the device for callbacks + * @fmt: string for the device's name + * + * This function can be used by char device classes. A struct device + * will be created in sysfs, registered to the specified class. + * + * A "dev" file will be created, showing the dev_t for the device, if + * the dev_t is not 0,0. + * If a pointer to a parent struct device is passed in, the newly created + * struct device will be a child of that device in sysfs. + * The pointer to the struct device will be returned from the call. + * Any further sysfs files that might be required can be created using this + * pointer. + * + * Note: the struct class passed to this function must have previously + * been created with a call to class_create(). + */ +struct device *device_create(struct class *class, struct device *parent, + dev_t devt, void *drvdata, const char *fmt, ...) 
+{ + va_list vargs; + struct device *dev; + + va_start(vargs, fmt); + dev = device_create_vargs(class, parent, devt, drvdata, fmt, vargs); + va_end(vargs); + return dev; +} +EXPORT_SYMBOL_GPL(device_create); + +static int __match_devt(struct device *dev, void *data) +{ + dev_t *devt = data; + + return dev->devt == *devt; +} + +/** + * device_destroy - removes a device that was created with device_create() + * @class: pointer to the struct class that this device was registered with + * @devt: the dev_t of the device that was previously registered + * + * This call unregisters and cleans up a device that was created with a + * call to device_create(). + */ +void device_destroy(struct class *class, dev_t devt) +{ + struct device *dev; + + dev = class_find_device(class, NULL, &devt, __match_devt); + if (dev) { + put_device(dev); + device_unregister(dev); + } +} +EXPORT_SYMBOL_GPL(device_destroy); + +/** + * device_rename - renames a device + * @dev: the pointer to the struct device to be renamed + * @new_name: the new name of the device + * + * It is the responsibility of the caller to provide mutual + * exclusion between two different calls of device_rename + * on the same device to ensure that new_name is valid and + * won't conflict with other devices. + */ +int device_rename(struct device *dev, char *new_name) +{ + char *old_class_name = NULL; + char *new_class_name = NULL; + char *old_device_name = NULL; + int error; + + dev = get_device(dev); + if (!dev) + return -EINVAL; + + pr_debug("device: '%s': %s: renaming to '%s'\n", dev_name(dev), + __func__, new_name); + +#ifdef CONFIG_SYSFS_DEPRECATED + if ((dev->class) && (dev->parent)) + old_class_name = make_class_name(dev->class->name, &dev->kobj); +#endif + + old_device_name = kmalloc(BUS_ID_SIZE, GFP_KERNEL); + if (!old_device_name) { + error = -ENOMEM; + goto out; + } + strlcpy(old_device_name, dev->bus_id, BUS_ID_SIZE); + strlcpy(dev->bus_id, new_name, BUS_ID_SIZE); + + error = kobject_rename(&dev->kobj, new_name); + if (error) { + strlcpy(dev->bus_id, old_device_name, BUS_ID_SIZE); + goto out; + } + +#ifdef CONFIG_SYSFS_DEPRECATED + if (old_class_name) { + new_class_name = make_class_name(dev->class->name, &dev->kobj); + if (new_class_name) { + error = sysfs_create_link_nowarn(&dev->parent->kobj, + &dev->kobj, + new_class_name); + if (error) + goto out; + sysfs_remove_link(&dev->parent->kobj, old_class_name); + } + } +#else + if (dev->class) { + error = sysfs_create_link_nowarn(&dev->class->p->class_subsys.kobj, + &dev->kobj, dev_name(dev)); + if (error) + goto out; + sysfs_remove_link(&dev->class->p->class_subsys.kobj, + old_device_name); + } +#endif + +out: + put_device(dev); + + kfree(new_class_name); + kfree(old_class_name); + kfree(old_device_name); + + return error; +} +EXPORT_SYMBOL_GPL(device_rename); + +static int device_move_class_links(struct device *dev, + struct device *old_parent, + struct device *new_parent) +{ + int error = 0; +#ifdef CONFIG_SYSFS_DEPRECATED + char *class_name; + + class_name = make_class_name(dev->class->name, &dev->kobj); + if (!class_name) { + error = -ENOMEM; + goto out; + } + if (old_parent) { + sysfs_remove_link(&dev->kobj, "device"); + sysfs_remove_link(&old_parent->kobj, class_name); + } + if (new_parent) { + error = sysfs_create_link(&dev->kobj, &new_parent->kobj, + "device"); + if (error) + goto out; + error = sysfs_create_link(&new_parent->kobj, &dev->kobj, + class_name); + if (error) + sysfs_remove_link(&dev->kobj, "device"); + } else + error = 0; +out: + kfree(class_name); + return error; 
+#else + if (old_parent) + sysfs_remove_link(&dev->kobj, "device"); + if (new_parent) + error = sysfs_create_link(&dev->kobj, &new_parent->kobj, + "device"); + return error; +#endif +} + +/** + * device_move - moves a device to a new parent + * @dev: the pointer to the struct device to be moved + * @new_parent: the new parent of the device (can by NULL) + */ +int device_move(struct device *dev, struct device *new_parent) +{ + int error; + struct device *old_parent; + struct kobject *new_parent_kobj; + + dev = get_device(dev); + if (!dev) + return -EINVAL; + + new_parent = get_device(new_parent); + new_parent_kobj = get_device_parent(dev, new_parent); + + pr_debug("device: '%s': %s: moving to '%s'\n", dev_name(dev), + __func__, new_parent ? dev_name(new_parent) : "<NULL>"); + error = kobject_move(&dev->kobj, new_parent_kobj); + if (error) { + cleanup_glue_dir(dev, new_parent_kobj); + put_device(new_parent); + goto out; + } + old_parent = dev->parent; + dev->parent = new_parent; + if (old_parent) + klist_remove(&dev->knode_parent); + if (new_parent) { + klist_add_tail(&dev->knode_parent, &new_parent->klist_children); + set_dev_node(dev, dev_to_node(new_parent)); + } + + if (!dev->class) + goto out_put; + error = device_move_class_links(dev, old_parent, new_parent); + if (error) { + /* We ignore errors on cleanup since we're hosed anyway... */ + device_move_class_links(dev, new_parent, old_parent); + if (!kobject_move(&dev->kobj, &old_parent->kobj)) { + if (new_parent) + klist_remove(&dev->knode_parent); + dev->parent = old_parent; + if (old_parent) { + klist_add_tail(&dev->knode_parent, + &old_parent->klist_children); + set_dev_node(dev, dev_to_node(old_parent)); + } + } + cleanup_glue_dir(dev, new_parent_kobj); + put_device(new_parent); + goto out; + } +out_put: + put_device(old_parent); +out: + put_device(dev); + return error; +} +EXPORT_SYMBOL_GPL(device_move); + +/** + * device_shutdown - call ->shutdown() on each device to shutdown. + */ +void device_shutdown(void) +{ + struct device *dev, *devn; + + list_for_each_entry_safe_reverse(dev, devn, &devices_kset->list, + kobj.entry) { + if (dev->bus && dev->bus->shutdown) { + dev_dbg(dev, "shutdown\n"); + dev->bus->shutdown(dev); + } else if (dev->driver && dev->driver->shutdown) { + dev_dbg(dev, "shutdown\n"); + dev->driver->shutdown(dev); + } + } + kobject_put(sysfs_dev_char_kobj); + kobject_put(sysfs_dev_block_kobj); + kobject_put(dev_kobj); +} diff --git a/libdde-linux26/lib/src/drivers/base/init.c b/libdde-linux26/lib/src/drivers/base/init.c new file mode 100644 index 00000000..ca5ac986 --- /dev/null +++ b/libdde-linux26/lib/src/drivers/base/init.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2002-3 Patrick Mochel + * Copyright (c) 2002-3 Open Source Development Labs + * + * This file is released under the GPLv2 + */ + +#include <linux/device.h> +#include <linux/init.h> +#include <linux/memory.h> + +#include "base.h" + +/** + * driver_init - initialize driver model. + * + * Call the driver model init functions to initialize their + * subsystems. Called early from init/main.c. + */ +void __init driver_init(void) +{ + /* These are the core pieces */ + devices_init(); + buses_init(); + classes_init(); +#ifndef DDE_LINUX + firmware_init(); + hypervisor_init(); +#endif + + /* These are also core pieces, but must come after the + * core core pieces. 
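 * (Editor's note: in a DDE_LINUX build only the device, bus, class
 * and platform-bus cores are brought up; the firmware, hypervisor,
 * system-bus, cpu, memory and attribute-container subsystems are
 * compiled out, as the #ifndef DDE_LINUX blocks show.)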
+ */ + platform_bus_init(); +#ifndef DDE_LINUX + system_bus_init(); + cpu_dev_init(); + memory_dev_init(); + attribute_container_init(); +#endif +} diff --git a/libdde-linux26/lib/src/drivers/char/random.c b/libdde-linux26/lib/src/drivers/char/random.c new file mode 100644 index 00000000..0430c9d0 --- /dev/null +++ b/libdde-linux26/lib/src/drivers/char/random.c @@ -0,0 +1,1709 @@ +/* + * random.c -- A strong random number generator + * + * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005 + * + * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All + * rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * (now, with legal B.S. out of the way.....) + * + * This routine gathers environmental noise from device drivers, etc., + * and returns good random numbers, suitable for cryptographic use. + * Besides the obvious cryptographic uses, these numbers are also good + * for seeding TCP sequence numbers, and other places where it is + * desirable to have numbers which are not only random, but hard to + * predict by an attacker. + * + * Theory of operation + * =================== + * + * Computers are very predictable devices. Hence it is extremely hard + * to produce truly random numbers on a computer --- as opposed to + * pseudo-random numbers, which can easily generated by using a + * algorithm. Unfortunately, it is very easy for attackers to guess + * the sequence of pseudo-random number generators, and for some + * applications this is not acceptable. So instead, we must try to + * gather "environmental noise" from the computer's environment, which + * must be hard for outside attackers to observe, and use that to + * generate random numbers. 
In a Unix environment, this is best done + * from inside the kernel. + * + * Sources of randomness from the environment include inter-keyboard + * timings, inter-interrupt timings from some interrupts, and other + * events which are both (a) non-deterministic and (b) hard for an + * outside observer to measure. Randomness from these sources are + * added to an "entropy pool", which is mixed using a CRC-like function. + * This is not cryptographically strong, but it is adequate assuming + * the randomness is not chosen maliciously, and it is fast enough that + * the overhead of doing it on every interrupt is very reasonable. + * As random bytes are mixed into the entropy pool, the routines keep + * an *estimate* of how many bits of randomness have been stored into + * the random number generator's internal state. + * + * When random bytes are desired, they are obtained by taking the SHA + * hash of the contents of the "entropy pool". The SHA hash avoids + * exposing the internal state of the entropy pool. It is believed to + * be computationally infeasible to derive any useful information + * about the input of SHA from its output. Even if it is possible to + * analyze SHA in some clever way, as long as the amount of data + * returned from the generator is less than the inherent entropy in + * the pool, the output data is totally unpredictable. For this + * reason, the routine decreases its internal estimate of how many + * bits of "true randomness" are contained in the entropy pool as it + * outputs random numbers. + * + * If this estimate goes to zero, the routine can still generate + * random numbers; however, an attacker may (at least in theory) be + * able to infer the future output of the generator from prior + * outputs. This requires successful cryptanalysis of SHA, which is + * not believed to be feasible, but there is a remote possibility. + * Nonetheless, these numbers should be useful for the vast majority + * of purposes. + * + * Exported interfaces ---- output + * =============================== + * + * There are three exported interfaces; the first is one designed to + * be used from within the kernel: + * + * void get_random_bytes(void *buf, int nbytes); + * + * This interface will return the requested number of random bytes, + * and place it in the requested buffer. + * + * The two other interfaces are two character devices /dev/random and + * /dev/urandom. /dev/random is suitable for use when very high + * quality randomness is desired (for example, for key generation or + * one-time pads), as it will only return a maximum of the number of + * bits of randomness (as estimated by the random number generator) + * contained in the entropy pool. + * + * The /dev/urandom device does not have this limit, and will return + * as many bytes as are requested. As more and more random bytes are + * requested without giving time for the entropy pool to recharge, + * this will result in random numbers that are merely cryptographically + * strong. For many applications, however, this is acceptable. + * + * Exported interfaces ---- input + * ============================== + * + * The current exported interfaces for gathering environmental noise + * from the devices are: + * + * void add_input_randomness(unsigned int type, unsigned int code, + * unsigned int value); + * void add_interrupt_randomness(int irq); + * + * add_input_randomness() uses the input layer interrupt timing, as well as + * the event type information from the hardware. 
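 * (Editor's note: add_input_randomness() is called by the input core
 * for every input event; as its implementation further down shows, it
 * drops auto-repeated values and feeds the event type, code and value
 * into the pool via add_timer_randomness().)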
+ * + * add_interrupt_randomness() uses the inter-interrupt timing as random + * inputs to the entropy pool. Note that not all interrupts are good + * sources of randomness! For example, the timer interrupts is not a + * good choice, because the periodicity of the interrupts is too + * regular, and hence predictable to an attacker. Disk interrupts are + * a better measure, since the timing of the disk interrupts are more + * unpredictable. + * + * All of these routines try to estimate how many bits of randomness a + * particular randomness source. They do this by keeping track of the + * first and second order deltas of the event timings. + * + * Ensuring unpredictability at system startup + * ============================================ + * + * When any operating system starts up, it will go through a sequence + * of actions that are fairly predictable by an adversary, especially + * if the start-up does not involve interaction with a human operator. + * This reduces the actual number of bits of unpredictability in the + * entropy pool below the value in entropy_count. In order to + * counteract this effect, it helps to carry information in the + * entropy pool across shut-downs and start-ups. To do this, put the + * following lines an appropriate script which is run during the boot + * sequence: + * + * echo "Initializing random number generator..." + * random_seed=/var/run/random-seed + * # Carry a random seed from start-up to start-up + * # Load and then save the whole entropy pool + * if [ -f $random_seed ]; then + * cat $random_seed >/dev/urandom + * else + * touch $random_seed + * fi + * chmod 600 $random_seed + * dd if=/dev/urandom of=$random_seed count=1 bs=512 + * + * and the following lines in an appropriate script which is run as + * the system is shutdown: + * + * # Carry a random seed from shut-down to start-up + * # Save the whole entropy pool + * echo "Saving random seed..." + * random_seed=/var/run/random-seed + * touch $random_seed + * chmod 600 $random_seed + * dd if=/dev/urandom of=$random_seed count=1 bs=512 + * + * For example, on most modern systems using the System V init + * scripts, such code fragments would be found in + * /etc/rc.d/init.d/random. On older Linux systems, the correct script + * location might be in /etc/rcb.d/rc.local or /etc/rc.d/rc.0. + * + * Effectively, these commands cause the contents of the entropy pool + * to be saved at shut-down time and reloaded into the entropy pool at + * start-up. (The 'dd' in the addition to the bootup script is to + * make sure that /etc/random-seed is different for every start-up, + * even if the system crashes without executing rc.0.) Even with + * complete knowledge of the start-up activities, predicting the state + * of the entropy pool requires knowledge of the previous history of + * the system. + * + * Configuring the /dev/random driver under Linux + * ============================================== + * + * The /dev/random driver under Linux uses minor numbers 8 and 9 of + * the /dev/mem major number (#1). So if your system does not have + * /dev/random and /dev/urandom created already, they can be created + * by using the commands: + * + * mknod /dev/random c 1 8 + * mknod /dev/urandom c 1 9 + * + * Acknowledgements: + * ================= + * + * Ideas for constructing this random number generator were derived + * from Pretty Good Privacy's random number generator, and from private + * discussions with Phil Karn. 
Colin Plumb provided a faster random + * number generator, which speed up the mixing function of the entropy + * pool, taken from PGPfone. Dale Worley has also contributed many + * useful ideas and suggestions to improve this driver. + * + * Any flaws in the design are solely my responsibility, and should + * not be attributed to the Phil, Colin, or any of authors of PGP. + * + * Further background information on this topic may be obtained from + * RFC 1750, "Randomness Recommendations for Security", by Donald + * Eastlake, Steve Crocker, and Jeff Schiller. + */ + +#ifdef DDE_LINUX +#include <ddekit/resources.h> +#else + +#include <linux/utsname.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/string.h> +#include <linux/fcntl.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/poll.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/interrupt.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> +#include <linux/cryptohash.h> + +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/irq.h> +#include <asm/io.h> + +/* + * Configuration information + */ +#define INPUT_POOL_WORDS 128 +#define OUTPUT_POOL_WORDS 32 +#define SEC_XFER_SIZE 512 + +/* + * The minimum number of bits of entropy before we wake up a read on + * /dev/random. Should be enough to do a significant reseed. + */ +static int random_read_wakeup_thresh = 64; + +/* + * If the entropy count falls under this number of bits, then we + * should wake up processes which are selecting or polling on write + * access to /dev/random. + */ +static int random_write_wakeup_thresh = 128; + +/* + * When the input pool goes over trickle_thresh, start dropping most + * samples to avoid wasting CPU time and reduce lock contention. + */ + +static int trickle_thresh __read_mostly = INPUT_POOL_WORDS * 28; + +static DEFINE_PER_CPU(int, trickle_count); + +/* + * A pool of size .poolwords is stirred with a primitive polynomial + * of degree .poolwords over GF(2). The taps for various sizes are + * defined below. They are chosen to be evenly spaced (minimum RMS + * distance from evenly spaced; the numbers in the comments are a + * scaled squared error sum) except for the last tap, which is 1 to + * get the twisting happening as fast as possible. 
+ */ +static struct poolinfo { + int poolwords; + int tap1, tap2, tap3, tap4, tap5; +} poolinfo_table[] = { + /* x^128 + x^103 + x^76 + x^51 +x^25 + x + 1 -- 105 */ + { 128, 103, 76, 51, 25, 1 }, + /* x^32 + x^26 + x^20 + x^14 + x^7 + x + 1 -- 15 */ + { 32, 26, 20, 14, 7, 1 }, +#if 0 + /* x^2048 + x^1638 + x^1231 + x^819 + x^411 + x + 1 -- 115 */ + { 2048, 1638, 1231, 819, 411, 1 }, + + /* x^1024 + x^817 + x^615 + x^412 + x^204 + x + 1 -- 290 */ + { 1024, 817, 615, 412, 204, 1 }, + + /* x^1024 + x^819 + x^616 + x^410 + x^207 + x^2 + 1 -- 115 */ + { 1024, 819, 616, 410, 207, 2 }, + + /* x^512 + x^411 + x^308 + x^208 + x^104 + x + 1 -- 225 */ + { 512, 411, 308, 208, 104, 1 }, + + /* x^512 + x^409 + x^307 + x^206 + x^102 + x^2 + 1 -- 95 */ + { 512, 409, 307, 206, 102, 2 }, + /* x^512 + x^409 + x^309 + x^205 + x^103 + x^2 + 1 -- 95 */ + { 512, 409, 309, 205, 103, 2 }, + + /* x^256 + x^205 + x^155 + x^101 + x^52 + x + 1 -- 125 */ + { 256, 205, 155, 101, 52, 1 }, + + /* x^128 + x^103 + x^78 + x^51 + x^27 + x^2 + 1 -- 70 */ + { 128, 103, 78, 51, 27, 2 }, + + /* x^64 + x^52 + x^39 + x^26 + x^14 + x + 1 -- 15 */ + { 64, 52, 39, 26, 14, 1 }, +#endif +}; + +#define POOLBITS poolwords*32 +#define POOLBYTES poolwords*4 + +/* + * For the purposes of better mixing, we use the CRC-32 polynomial as + * well to make a twisted Generalized Feedback Shift Reigster + * + * (See M. Matsumoto & Y. Kurita, 1992. Twisted GFSR generators. ACM + * Transactions on Modeling and Computer Simulation 2(3):179-194. + * Also see M. Matsumoto & Y. Kurita, 1994. Twisted GFSR generators + * II. ACM Transactions on Mdeling and Computer Simulation 4:254-266) + * + * Thanks to Colin Plumb for suggesting this. + * + * We have not analyzed the resultant polynomial to prove it primitive; + * in fact it almost certainly isn't. Nonetheless, the irreducible factors + * of a random large-degree polynomial over GF(2) are more than large enough + * that periodicity is not a concern. + * + * The input hash is much less sensitive than the output hash. All + * that we want of it is that it be a good non-cryptographic hash; + * i.e. it not produce collisions when fed "random" data of the sort + * we expect to see. As long as the pool state differs for different + * inputs, we have preserved the input entropy and done a good job. + * The fact that an intelligent attacker can construct inputs that + * will produce controlled alterations to the pool's state is not + * important because we don't consider such inputs to contribute any + * randomness. The only property we need with respect to them is that + * the attacker can't increase his/her knowledge of the pool's state. + * Since all additions are reversible (knowing the final state and the + * input, you can reconstruct the initial state), if an attacker has + * any uncertainty about the initial state, he/she can only shuffle + * that uncertainty about, but never cause any collisions (which would + * decrease the uncertainty). + * + * The chosen system lets the state of the pool be (essentially) the input + * modulo the generator polymnomial. Now, for random primitive polynomials, + * this is a universal class of hash functions, meaning that the chance + * of a collision is limited by the attacker's knowledge of the generator + * polynomail, so if it is chosen at random, an attacker can never force + * a collision. Here, we use a fixed polynomial, but we *can* assume that + * ###--> it is unknown to the processes generating the input entropy. 
<-### + * Because of this important property, this is a good, collision-resistant + * hash; hash collisions will occur no more often than chance. + */ + +/* + * Static global variables + */ +static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); +static struct fasync_struct *fasync; + +#if 0 +static int debug; +module_param(debug, bool, 0644); +#define DEBUG_ENT(fmt, arg...) do { \ + if (debug) \ + printk(KERN_DEBUG "random %04d %04d %04d: " \ + fmt,\ + input_pool.entropy_count,\ + blocking_pool.entropy_count,\ + nonblocking_pool.entropy_count,\ + ## arg); } while (0) +#else +#define DEBUG_ENT(fmt, arg...) do {} while (0) +#endif + +/********************************************************************** + * + * OS independent entropy store. Here are the functions which handle + * storing entropy in an entropy pool. + * + **********************************************************************/ + +struct entropy_store; +struct entropy_store { + /* read-only data: */ + struct poolinfo *poolinfo; + __u32 *pool; + const char *name; + int limit; + struct entropy_store *pull; + + /* read-write data: */ + spinlock_t lock; + unsigned add_ptr; + int entropy_count; + int input_rotate; +}; + +static __u32 input_pool_data[INPUT_POOL_WORDS]; +static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; +static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS]; + +static struct entropy_store input_pool = { + .poolinfo = &poolinfo_table[0], + .name = "input", + .limit = 1, + .lock = __SPIN_LOCK_UNLOCKED(&input_pool.lock), + .pool = input_pool_data +}; + +static struct entropy_store blocking_pool = { + .poolinfo = &poolinfo_table[1], + .name = "blocking", + .limit = 1, + .pull = &input_pool, + .lock = __SPIN_LOCK_UNLOCKED(&blocking_pool.lock), + .pool = blocking_pool_data +}; + +static struct entropy_store nonblocking_pool = { + .poolinfo = &poolinfo_table[1], + .name = "nonblocking", + .pull = &input_pool, + .lock = __SPIN_LOCK_UNLOCKED(&nonblocking_pool.lock), + .pool = nonblocking_pool_data +}; + +/* + * This function adds bytes into the entropy "pool". It does not + * update the entropy estimate. The caller should call + * credit_entropy_bits if this is appropriate. + * + * The pool is stirred with a primitive polynomial of the appropriate + * degree, and then twisted. We twist by three bits at a time because + * it's cheap to do so and helps slightly in the expected case where + * the entropy is concentrated in the low-order bits. + */ +static void mix_pool_bytes_extract(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) +{ + static __u32 const twist_table[8] = { + 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, + 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; + unsigned long i, j, tap1, tap2, tap3, tap4, tap5; + int input_rotate; + int wordmask = r->poolinfo->poolwords - 1; + const char *bytes = in; + __u32 w; + unsigned long flags; + + /* Taps are constant, so we can load them without holding r->lock. 
*/ + tap1 = r->poolinfo->tap1; + tap2 = r->poolinfo->tap2; + tap3 = r->poolinfo->tap3; + tap4 = r->poolinfo->tap4; + tap5 = r->poolinfo->tap5; + + spin_lock_irqsave(&r->lock, flags); + input_rotate = r->input_rotate; + i = r->add_ptr; + + /* mix one byte at a time to simplify size handling and churn faster */ + while (nbytes--) { + w = rol32(*bytes++, input_rotate & 31); + i = (i - 1) & wordmask; + + /* XOR in the various taps */ + w ^= r->pool[i]; + w ^= r->pool[(i + tap1) & wordmask]; + w ^= r->pool[(i + tap2) & wordmask]; + w ^= r->pool[(i + tap3) & wordmask]; + w ^= r->pool[(i + tap4) & wordmask]; + w ^= r->pool[(i + tap5) & wordmask]; + + /* Mix the result back in with a twist */ + r->pool[i] = (w >> 3) ^ twist_table[w & 7]; + + /* + * Normally, we add 7 bits of rotation to the pool. + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + input_rotate += i ? 7 : 14; + } + + r->input_rotate = input_rotate; + r->add_ptr = i; + + if (out) + for (j = 0; j < 16; j++) + ((__u32 *)out)[j] = r->pool[(i - j) & wordmask]; + + spin_unlock_irqrestore(&r->lock, flags); +} + +static void mix_pool_bytes(struct entropy_store *r, const void *in, int bytes) +{ + mix_pool_bytes_extract(r, in, bytes, NULL); +} + +/* + * Credit (or debit) the entropy store with n bits of entropy + */ +static void credit_entropy_bits(struct entropy_store *r, int nbits) +{ + unsigned long flags; + int entropy_count; + + if (!nbits) + return; + + spin_lock_irqsave(&r->lock, flags); + + DEBUG_ENT("added %d entropy credits to %s\n", nbits, r->name); + entropy_count = r->entropy_count; + entropy_count += nbits; + if (entropy_count < 0) { + DEBUG_ENT("negative entropy/overflow\n"); + entropy_count = 0; + } else if (entropy_count > r->poolinfo->POOLBITS) + entropy_count = r->poolinfo->POOLBITS; + r->entropy_count = entropy_count; + + /* should we wake readers? */ + if (r == &input_pool && entropy_count >= random_read_wakeup_thresh) { + wake_up_interruptible(&random_read_wait); + kill_fasync(&fasync, SIGIO, POLL_IN); + } + spin_unlock_irqrestore(&r->lock, flags); +} + +/********************************************************************* + * + * Entropy input management + * + *********************************************************************/ + +/* There is one of these per entropy source */ +struct timer_rand_state { + cycles_t last_time; + long last_delta, last_delta2; + unsigned dont_count_entropy:1; +}; + +#ifndef CONFIG_SPARSE_IRQ + +static struct timer_rand_state *irq_timer_state[NR_IRQS]; + +static struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + return irq_timer_state[irq]; +} + +static void set_timer_rand_state(unsigned int irq, + struct timer_rand_state *state) +{ + irq_timer_state[irq] = state; +} + +#else + +static struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + return desc->timer_rand_state; +} + +static void set_timer_rand_state(unsigned int irq, + struct timer_rand_state *state) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + desc->timer_rand_state = state; +} +#endif + +static struct timer_rand_state input_timer_state; + +/* + * This function adds entropy to the entropy "pool" by using timing + * delays. It uses the timer_rand_state structure to make an estimate + * of how many bits of entropy this call has added to the pool. 
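 * (Editor's worked example: if the first-, second- and third-order
 * deltas for an event come to 100, 37 and 12 jiffies, the smallest
 * absolute delta is 12, and the code below credits
 * min(fls(12 >> 1), 11) = fls(6) = 3 bits of entropy to the pool.)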
+ * + * The number "num" is also added to the pool - it should somehow describe + * the type of event which just happened. This is currently 0-255 for + * keyboard scan codes, and 256 upwards for interrupts. + * + */ +static void add_timer_randomness(struct timer_rand_state *state, unsigned num) +{ + struct { + cycles_t cycles; + long jiffies; + unsigned num; + } sample; + long delta, delta2, delta3; + + preempt_disable(); + /* if over the trickle threshold, use only 1 in 4096 samples */ + if (input_pool.entropy_count > trickle_thresh && + (__get_cpu_var(trickle_count)++ & 0xfff)) + goto out; + + sample.jiffies = jiffies; + sample.cycles = get_cycles(); + sample.num = num; + mix_pool_bytes(&input_pool, &sample, sizeof(sample)); + + /* + * Calculate number of bits of randomness we probably added. + * We take into account the first, second and third-order deltas + * in order to make our estimate. + */ + + if (!state->dont_count_entropy) { + delta = sample.jiffies - state->last_time; + state->last_time = sample.jiffies; + + delta2 = delta - state->last_delta; + state->last_delta = delta; + + delta3 = delta2 - state->last_delta2; + state->last_delta2 = delta2; + + if (delta < 0) + delta = -delta; + if (delta2 < 0) + delta2 = -delta2; + if (delta3 < 0) + delta3 = -delta3; + if (delta > delta2) + delta = delta2; + if (delta > delta3) + delta = delta3; + + /* + * delta is now minimum absolute delta. + * Round down by 1 bit on general principles, + * and limit entropy entimate to 12 bits. + */ + credit_entropy_bits(&input_pool, + min_t(int, fls(delta>>1), 11)); + } +out: + preempt_enable(); +} + +void add_input_randomness(unsigned int type, unsigned int code, + unsigned int value) +{ + static unsigned char last_value; + + /* ignore autorepeat and the like */ + if (value == last_value) + return; + + DEBUG_ENT("input event\n"); + last_value = value; + add_timer_randomness(&input_timer_state, + (type << 4) ^ code ^ (code >> 4) ^ value); +} +EXPORT_SYMBOL_GPL(add_input_randomness); + +void add_interrupt_randomness(int irq) +{ + struct timer_rand_state *state; + + state = get_timer_rand_state(irq); + + if (state == NULL) + return; + + DEBUG_ENT("irq event %d\n", irq); + add_timer_randomness(state, 0x100 + irq); +} + +#ifdef CONFIG_BLOCK +void add_disk_randomness(struct gendisk *disk) +{ + if (!disk || !disk->random) + return; + /* first major is 1, so we get >= 0x200 here */ + DEBUG_ENT("disk event %d:%d\n", + MAJOR(disk_devt(disk)), MINOR(disk_devt(disk))); + + add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); +} +#endif + +#define EXTRACT_SIZE 10 + +/********************************************************************* + * + * Entropy extraction routines + * + *********************************************************************/ + +static ssize_t extract_entropy(struct entropy_store *r, void *buf, + size_t nbytes, int min, int rsvd); + +/* + * This utility inline function is responsible for transfering entropy + * from the primary pool to the secondary extraction pool. We make + * sure we pull enough for a 'catastrophic reseed'. + */ +static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) +{ + __u32 tmp[OUTPUT_POOL_WORDS]; + + if (r->pull && r->entropy_count < nbytes * 8 && + r->entropy_count < r->poolinfo->POOLBITS) { + /* If we're limited, always leave two wakeup worth's BITS */ + int rsvd = r->limit ? 
0 : random_read_wakeup_thresh/4; + int bytes = nbytes; + + /* pull at least as many as BYTES as wakeup BITS */ + bytes = max_t(int, bytes, random_read_wakeup_thresh / 8); + /* but never more than the buffer size */ + bytes = min_t(int, bytes, sizeof(tmp)); + + DEBUG_ENT("going to reseed %s with %d bits " + "(%d of %d requested)\n", + r->name, bytes * 8, nbytes * 8, r->entropy_count); + + bytes = extract_entropy(r->pull, tmp, bytes, + random_read_wakeup_thresh / 8, rsvd); + mix_pool_bytes(r, tmp, bytes); + credit_entropy_bits(r, bytes*8); + } +} + +/* + * These functions extracts randomness from the "entropy pool", and + * returns it in a buffer. + * + * The min parameter specifies the minimum amount we can pull before + * failing to avoid races that defeat catastrophic reseeding while the + * reserved parameter indicates how much entropy we must leave in the + * pool after each pull to avoid starving other readers. + * + * Note: extract_entropy() assumes that .poolwords is a multiple of 16 words. + */ + +static size_t account(struct entropy_store *r, size_t nbytes, int min, + int reserved) +{ + unsigned long flags; + + /* Hold lock while accounting */ + spin_lock_irqsave(&r->lock, flags); + + BUG_ON(r->entropy_count > r->poolinfo->POOLBITS); + DEBUG_ENT("trying to extract %d bits from %s\n", + nbytes * 8, r->name); + + /* Can we pull enough? */ + if (r->entropy_count / 8 < min + reserved) { + nbytes = 0; + } else { + /* If limited, never pull more than available */ + if (r->limit && nbytes + reserved >= r->entropy_count / 8) + nbytes = r->entropy_count/8 - reserved; + + if (r->entropy_count / 8 >= nbytes + reserved) + r->entropy_count -= nbytes*8; + else + r->entropy_count = reserved; + + if (r->entropy_count < random_write_wakeup_thresh) { + wake_up_interruptible(&random_write_wait); + kill_fasync(&fasync, SIGIO, POLL_OUT); + } + } + + DEBUG_ENT("debiting %d entropy credits from %s%s\n", + nbytes * 8, r->name, r->limit ? "" : " (unlimited)"); + + spin_unlock_irqrestore(&r->lock, flags); + + return nbytes; +} + +static void extract_buf(struct entropy_store *r, __u8 *out) +{ + int i; + __u32 hash[5], workspace[SHA_WORKSPACE_WORDS]; + __u8 extract[64]; + + /* Generate a hash across the pool, 16 words (512 bits) at a time */ + sha_init(hash); + for (i = 0; i < r->poolinfo->poolwords; i += 16) + sha_transform(hash, (__u8 *)(r->pool + i), workspace); + + /* + * We mix the hash back into the pool to prevent backtracking + * attacks (where the attacker knows the state of the pool + * plus the current outputs, and attempts to find previous + * ouputs), unless the hash function can be inverted. By + * mixing at least a SHA1 worth of hash data back, we make + * brute-forcing the feedback as hard as brute-forcing the + * hash. + */ + mix_pool_bytes_extract(r, hash, sizeof(hash), extract); + + /* + * To avoid duplicates, we atomically extract a portion of the + * pool while mixing, and hash one final time. + */ + sha_transform(hash, extract, workspace); + memset(extract, 0, sizeof(extract)); + memset(workspace, 0, sizeof(workspace)); + + /* + * In case the hash function has some recognizable output + * pattern, we fold it in half. Thus, we always feed back + * twice as much data as we output. 
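 * (Editor's note: the 5-word, 160-bit SHA-1 state is folded here and
 * only EXTRACT_SIZE (10) bytes of it are copied out, so each
 * extraction reveals at most half of the hash output.)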
+ */ + hash[0] ^= hash[3]; + hash[1] ^= hash[4]; + hash[2] ^= rol32(hash[2], 16); + memcpy(out, hash, EXTRACT_SIZE); + memset(hash, 0, sizeof(hash)); +} + +static ssize_t extract_entropy(struct entropy_store *r, void *buf, + size_t nbytes, int min, int reserved) +{ + ssize_t ret = 0, i; + __u8 tmp[EXTRACT_SIZE]; + + xfer_secondary_pool(r, nbytes); + nbytes = account(r, nbytes, min, reserved); + + while (nbytes) { + extract_buf(r, tmp); + i = min_t(int, nbytes, EXTRACT_SIZE); + memcpy(buf, tmp, i); + nbytes -= i; + buf += i; + ret += i; + } + + /* Wipe data just returned from memory */ + memset(tmp, 0, sizeof(tmp)); + + return ret; +} + +static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, + size_t nbytes) +{ + ssize_t ret = 0, i; + __u8 tmp[EXTRACT_SIZE]; + + xfer_secondary_pool(r, nbytes); + nbytes = account(r, nbytes, 0, 0); + + while (nbytes) { + if (need_resched()) { + if (signal_pending(current)) { + if (ret == 0) + ret = -ERESTARTSYS; + break; + } + schedule(); + } + + extract_buf(r, tmp); + i = min_t(int, nbytes, EXTRACT_SIZE); + if (copy_to_user(buf, tmp, i)) { + ret = -EFAULT; + break; + } + + nbytes -= i; + buf += i; + ret += i; + } + + /* Wipe data just returned from memory */ + memset(tmp, 0, sizeof(tmp)); + + return ret; +} + +#endif + +/* + * This function is the exported kernel interface. It returns some + * number of good random numbers, suitable for seeding TCP sequence + * numbers, etc. + */ +void get_random_bytes(void *buf, int nbytes) +{ +#ifndef DDE_LINUX + extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); +#else + int i; + int nlwords = nbytes / sizeof (long); + for (i = 0; i < nlwords; i++) + ((long *) buf)[i] = ddekit_random (); + for (i = nlwords * sizeof (long); i < nbytes; i++) + ((char *) buf)[i] = (char) ddekit_random (); +#endif +} +EXPORT_SYMBOL(get_random_bytes); + +#ifndef DDE_LINUX +/* + * init_std_data - initialize pool with system data + * + * @r: pool to initialize + * + * This function clears the pool's entropy count and mixes some system + * data into the pool to prepare it for use. The pool is not cleared + * as that can only decrease the entropy in the pool. + */ +static void init_std_data(struct entropy_store *r) +{ + ktime_t now; + unsigned long flags; + + spin_lock_irqsave(&r->lock, flags); + r->entropy_count = 0; + spin_unlock_irqrestore(&r->lock, flags); + + now = ktime_get_real(); + mix_pool_bytes(r, &now, sizeof(now)); + mix_pool_bytes(r, utsname(), sizeof(*(utsname()))); +} + +static int rand_initialize(void) +{ + init_std_data(&input_pool); + init_std_data(&blocking_pool); + init_std_data(&nonblocking_pool); + return 0; +} +module_init(rand_initialize); + +void rand_initialize_irq(int irq) +{ + struct timer_rand_state *state; + + state = get_timer_rand_state(irq); + + if (state) + return; + + /* + * If kzalloc returns null, we just won't use that entropy + * source. + */ + state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL); + if (state) + set_timer_rand_state(irq, state); +} + +#ifdef CONFIG_BLOCK +void rand_initialize_disk(struct gendisk *disk) +{ + struct timer_rand_state *state; + + /* + * If kzalloc returns null, we just won't use that entropy + * source. 
+ */ + state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL); + if (state) + disk->random = state; +} +#endif + +static ssize_t +random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + ssize_t n, retval = 0, count = 0; + + if (nbytes == 0) + return 0; + + while (nbytes > 0) { + n = nbytes; + if (n > SEC_XFER_SIZE) + n = SEC_XFER_SIZE; + + DEBUG_ENT("reading %d bits\n", n*8); + + n = extract_entropy_user(&blocking_pool, buf, n); + + DEBUG_ENT("read got %d bits (%d still needed)\n", + n*8, (nbytes-n)*8); + + if (n == 0) { + if (file->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + break; + } + + DEBUG_ENT("sleeping?\n"); + + wait_event_interruptible(random_read_wait, + input_pool.entropy_count >= + random_read_wakeup_thresh); + + DEBUG_ENT("awake\n"); + + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + + continue; + } + + if (n < 0) { + retval = n; + break; + } + count += n; + buf += n; + nbytes -= n; + break; /* This break makes the device work */ + /* like a named pipe */ + } + + /* + * If we gave the user some bytes, update the access time. + */ + if (count) + file_accessed(file); + + return (count ? count : retval); +} + +static ssize_t +urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) +{ + return extract_entropy_user(&nonblocking_pool, buf, nbytes); +} + +static unsigned int +random_poll(struct file *file, poll_table * wait) +{ + unsigned int mask; + + poll_wait(file, &random_read_wait, wait); + poll_wait(file, &random_write_wait, wait); + mask = 0; + if (input_pool.entropy_count >= random_read_wakeup_thresh) + mask |= POLLIN | POLLRDNORM; + if (input_pool.entropy_count < random_write_wakeup_thresh) + mask |= POLLOUT | POLLWRNORM; + return mask; +} + +static int +write_pool(struct entropy_store *r, const char __user *buffer, size_t count) +{ + size_t bytes; + __u32 buf[16]; + const char __user *p = buffer; + + while (count > 0) { + bytes = min(count, sizeof(buf)); + if (copy_from_user(&buf, p, bytes)) + return -EFAULT; + + count -= bytes; + p += bytes; + + mix_pool_bytes(r, buf, bytes); + cond_resched(); + } + + return 0; +} + +static ssize_t random_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + size_t ret; + struct inode *inode = file->f_path.dentry->d_inode; + + ret = write_pool(&blocking_pool, buffer, count); + if (ret) + return ret; + ret = write_pool(&nonblocking_pool, buffer, count); + if (ret) + return ret; + + inode->i_mtime = current_fs_time(inode->i_sb); + mark_inode_dirty(inode); + return (ssize_t)count; +} + +static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + int size, ent_count; + int __user *p = (int __user *)arg; + int retval; + + switch (cmd) { + case RNDGETENTCNT: + /* inherently racy, no point locking */ + if (put_user(input_pool.entropy_count, p)) + return -EFAULT; + return 0; + case RNDADDTOENTCNT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ent_count, p)) + return -EFAULT; + credit_entropy_bits(&input_pool, ent_count); + return 0; + case RNDADDENTROPY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ent_count, p++)) + return -EFAULT; + if (ent_count < 0) + return -EINVAL; + if (get_user(size, p++)) + return -EFAULT; + retval = write_pool(&input_pool, (const char __user *)p, + size); + if (retval < 0) + return retval; + credit_entropy_bits(&input_pool, ent_count); + return 0; + case RNDZAPENTCNT: + case RNDCLEARPOOL: + /* Clear the entropy pool counters. 
*/ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + rand_initialize(); + return 0; + default: + return -EINVAL; + } +} + +static int random_fasync(int fd, struct file *filp, int on) +{ + return fasync_helper(fd, filp, on, &fasync); +} + +const struct file_operations random_fops = { + .read = random_read, + .write = random_write, + .poll = random_poll, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, +}; + +const struct file_operations urandom_fops = { + .read = urandom_read, + .write = random_write, + .unlocked_ioctl = random_ioctl, + .fasync = random_fasync, +}; + +/*************************************************************** + * Random UUID interface + * + * Used here for a Boot ID, but can be useful for other kernel + * drivers. + ***************************************************************/ + +/* + * Generate random UUID + */ +void generate_random_uuid(unsigned char uuid_out[16]) +{ + get_random_bytes(uuid_out, 16); + /* Set UUID version to 4 --- truely random generation */ + uuid_out[6] = (uuid_out[6] & 0x0F) | 0x40; + /* Set the UUID variant to DCE */ + uuid_out[8] = (uuid_out[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_uuid); + +/******************************************************************** + * + * Sysctl interface + * + ********************************************************************/ + +#ifdef CONFIG_SYSCTL + +#include <linux/sysctl.h> + +static int min_read_thresh = 8, min_write_thresh; +static int max_read_thresh = INPUT_POOL_WORDS * 32; +static int max_write_thresh = INPUT_POOL_WORDS * 32; +static char sysctl_bootid[16]; + +/* + * These functions is used to return both the bootid UUID, and random + * UUID. The difference is in whether table->data is NULL; if it is, + * then a new UUID is generated and returned to the user. + * + * If the user accesses this via the proc interface, it will be returned + * as an ASCII string in the standard UUID format. If accesses via the + * sysctl system call, it is returned as 16 bytes of binary data. 
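 * (Editor's note, illustrative only: through the sysctl table below
 * these handlers back /proc/sys/kernel/random/uuid, which returns a
 * fresh version-4 UUID on every read, and
 * /proc/sys/kernel/random/boot_id, which keeps the same UUID for the
 * lifetime of the boot.)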
+ */ +static int proc_do_uuid(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + ctl_table fake_table; + unsigned char buf[64], tmp_uuid[16], *uuid; + + uuid = table->data; + if (!uuid) { + uuid = tmp_uuid; + uuid[8] = 0; + } + if (uuid[8] == 0) + generate_random_uuid(uuid); + + sprintf(buf, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" + "%02x%02x%02x%02x%02x%02x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], + uuid[12], uuid[13], uuid[14], uuid[15]); + fake_table.data = buf; + fake_table.maxlen = sizeof(buf); + + return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos); +} + +static int uuid_strategy(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + unsigned char tmp_uuid[16], *uuid; + unsigned int len; + + if (!oldval || !oldlenp) + return 1; + + uuid = table->data; + if (!uuid) { + uuid = tmp_uuid; + uuid[8] = 0; + } + if (uuid[8] == 0) + generate_random_uuid(uuid); + + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > 16) + len = 16; + if (copy_to_user(oldval, uuid, len) || + put_user(len, oldlenp)) + return -EFAULT; + } + return 1; +} + +static int sysctl_poolsize = INPUT_POOL_WORDS * 32; +ctl_table random_table[] = { + { + .ctl_name = RANDOM_POOLSIZE, + .procname = "poolsize", + .data = &sysctl_poolsize, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = RANDOM_ENTROPY_COUNT, + .procname = "entropy_avail", + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .data = &input_pool.entropy_count, + }, + { + .ctl_name = RANDOM_READ_THRESH, + .procname = "read_wakeup_threshold", + .data = &random_read_wakeup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_read_thresh, + .extra2 = &max_read_thresh, + }, + { + .ctl_name = RANDOM_WRITE_THRESH, + .procname = "write_wakeup_threshold", + .data = &random_write_wakeup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_write_thresh, + .extra2 = &max_write_thresh, + }, + { + .ctl_name = RANDOM_BOOT_ID, + .procname = "boot_id", + .data = &sysctl_bootid, + .maxlen = 16, + .mode = 0444, + .proc_handler = &proc_do_uuid, + .strategy = &uuid_strategy, + }, + { + .ctl_name = RANDOM_UUID, + .procname = "uuid", + .maxlen = 16, + .mode = 0444, + .proc_handler = &proc_do_uuid, + .strategy = &uuid_strategy, + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + +/******************************************************************** + * + * Random funtions for networking + * + ********************************************************************/ + +/* + * TCP initial sequence number picking. This uses the random number + * generator to pick an initial secret value. This value is hashed + * along with the TCP endpoint information to provide a unique + * starting point for each pair of TCP endpoints. This defeats + * attacks which rely on guessing the initial TCP sequence number. + * This algorithm was suggested by Steve Bellovin. + * + * Using a very strong hash was taking an appreciable amount of the total + * TCP connection establishment time, so this is a weaker hash, + * compensated for by changing the secret periodically. 
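 * (Editor's note: concretely, the "weaker hash" is a truncated MD4
 * transform (see twothirdsMD4Transform() below and the
 * half_md4_transform() calls further down), and rekey_seq_generator()
 * replaces the secret every REKEY_INTERVAL, i.e. 300 * HZ or five
 * minutes.)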
+ */ + +/* F, G and H are basic MD4 functions: selection, majority, parity */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +/* + * The generic round function. The application is so specific that + * we don't bother protecting all the arguments with parens, as is generally + * good macro practice, in favor of extra legibility. + * Rotation is separate from addition to prevent recomputation + */ +#define ROUND(f, a, b, c, d, x, s) \ + (a += f(b, c, d) + x, a = (a << s) | (a >> (32 - s))) +#define K1 0 +#define K2 013240474631UL +#define K3 015666365641UL + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +static __u32 twothirdsMD4Transform(__u32 const buf[4], __u32 const in[12]) +{ + __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ + ROUND(F, a, b, c, d, in[ 0] + K1, 3); + ROUND(F, d, a, b, c, in[ 1] + K1, 7); + ROUND(F, c, d, a, b, in[ 2] + K1, 11); + ROUND(F, b, c, d, a, in[ 3] + K1, 19); + ROUND(F, a, b, c, d, in[ 4] + K1, 3); + ROUND(F, d, a, b, c, in[ 5] + K1, 7); + ROUND(F, c, d, a, b, in[ 6] + K1, 11); + ROUND(F, b, c, d, a, in[ 7] + K1, 19); + ROUND(F, a, b, c, d, in[ 8] + K1, 3); + ROUND(F, d, a, b, c, in[ 9] + K1, 7); + ROUND(F, c, d, a, b, in[10] + K1, 11); + ROUND(F, b, c, d, a, in[11] + K1, 19); + + /* Round 2 */ + ROUND(G, a, b, c, d, in[ 1] + K2, 3); + ROUND(G, d, a, b, c, in[ 3] + K2, 5); + ROUND(G, c, d, a, b, in[ 5] + K2, 9); + ROUND(G, b, c, d, a, in[ 7] + K2, 13); + ROUND(G, a, b, c, d, in[ 9] + K2, 3); + ROUND(G, d, a, b, c, in[11] + K2, 5); + ROUND(G, c, d, a, b, in[ 0] + K2, 9); + ROUND(G, b, c, d, a, in[ 2] + K2, 13); + ROUND(G, a, b, c, d, in[ 4] + K2, 3); + ROUND(G, d, a, b, c, in[ 6] + K2, 5); + ROUND(G, c, d, a, b, in[ 8] + K2, 9); + ROUND(G, b, c, d, a, in[10] + K2, 13); + + /* Round 3 */ + ROUND(H, a, b, c, d, in[ 3] + K3, 3); + ROUND(H, d, a, b, c, in[ 7] + K3, 9); + ROUND(H, c, d, a, b, in[11] + K3, 11); + ROUND(H, b, c, d, a, in[ 2] + K3, 15); + ROUND(H, a, b, c, d, in[ 6] + K3, 3); + ROUND(H, d, a, b, c, in[10] + K3, 9); + ROUND(H, c, d, a, b, in[ 1] + K3, 11); + ROUND(H, b, c, d, a, in[ 5] + K3, 15); + ROUND(H, a, b, c, d, in[ 9] + K3, 3); + ROUND(H, d, a, b, c, in[ 0] + K3, 9); + ROUND(H, c, d, a, b, in[ 4] + K3, 11); + ROUND(H, b, c, d, a, in[ 8] + K3, 15); + + return buf[1] + b; /* "most hashed" word */ + /* Alternative: return sum of all words? */ +} +#endif + +#undef ROUND +#undef F +#undef G +#undef H +#undef K1 +#undef K2 +#undef K3 + +/* This should not be decreased so low that ISNs wrap too fast. */ +#define REKEY_INTERVAL (300 * HZ) +/* + * Bit layout of the tcp sequence numbers (before adding current time): + * bit 24-31: increased after every key exchange + * bit 0-23: hash(source,dest) + * + * The implementation is similar to the algorithm described + * in the Appendix of RFC 1185, except that + * - it uses a 1 MHz clock instead of a 250 kHz clock + * - it performs a rekey every 5 minutes, which is equivalent + * to a (source,dest) tulple dependent forward jump of the + * clock by 0..2^(HASH_BITS+1) + * + * Thus the average ISN wraparound time is 68 minutes instead of + * 4.55 hours. + * + * SMP cleanup and lock avoidance with poor man's RCU. 
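[Editor's note] The "poor man's RCU" mentioned above is implemented just below with two keydata slots selected by the low bit of ip_cnt. The publish/consume ordering can be sketched in portable C11; the atomics here stand in for the kernel's smp_wmb()/smp_rmb(), and fill_random() is a placeholder (rand(), not a real entropy source). A sketch, not the original code:

    #include <stdatomic.h>
    #include <stdlib.h>

    struct keydata {
            unsigned int count;
            unsigned int secret[12];
    };

    static struct keydata keys[2];
    static atomic_uint key_cnt;

    static void fill_random(unsigned int *buf, int n)
    {
            int i;

            for (i = 0; i < n; i++)
                    buf[i] = (unsigned int)rand();  /* placeholder only */
    }

    /* writer: refresh the inactive copy, then publish it by bumping the counter */
    static void rekey(void)
    {
            unsigned int cur = atomic_load_explicit(&key_cnt, memory_order_relaxed);
            struct keydata *inactive = &keys[1 ^ (cur & 1)];

            fill_random(inactive->secret, 12);
            inactive->count = cur + 1;
            atomic_store_explicit(&key_cnt, cur + 1, memory_order_release);
    }

    /* readers: always hash with whichever copy is currently published */
    static struct keydata *get_key(void)
    {
            unsigned int cur = atomic_load_explicit(&key_cnt, memory_order_acquire);

            return &keys[cur & 1];
    }

    int main(void)
    {
            rekey();
            return get_key()->count == 1 ? 0 : 1;
    }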
+ * Manfred Spraul <manfred@colorfullife.com> + * + */ +#define COUNT_BITS 8 +#define COUNT_MASK ((1 << COUNT_BITS) - 1) +#define HASH_BITS 24 +#define HASH_MASK ((1 << HASH_BITS) - 1) + +static struct keydata { + __u32 count; /* already shifted to the final position */ + __u32 secret[12]; +} ____cacheline_aligned ip_keydata[2]; + +static unsigned int ip_cnt; + +static void rekey_seq_generator(struct work_struct *work); + +static DECLARE_DELAYED_WORK(rekey_work, rekey_seq_generator); + +/* + * Lock avoidance: + * The ISN generation runs lockless - it's just a hash over random data. + * State changes happen every 5 minutes when the random key is replaced. + * Synchronization is performed by having two copies of the hash function + * state and rekey_seq_generator always updates the inactive copy. + * The copy is then activated by updating ip_cnt. + * The implementation breaks down if someone blocks the thread + * that processes SYN requests for more than 5 minutes. Should never + * happen, and even if that happens only a not perfectly compliant + * ISN is generated, nothing fatal. + */ +static void rekey_seq_generator(struct work_struct *work) +{ + struct keydata *keyptr = &ip_keydata[1 ^ (ip_cnt & 1)]; + + get_random_bytes(keyptr->secret, sizeof(keyptr->secret)); + keyptr->count = (ip_cnt & COUNT_MASK) << HASH_BITS; + smp_wmb(); + ip_cnt++; + schedule_delayed_work(&rekey_work, REKEY_INTERVAL); +} + +static inline struct keydata *get_keyptr(void) +{ + struct keydata *keyptr = &ip_keydata[ip_cnt & 1]; + + smp_rmb(); + + return keyptr; +} + +static __init int seqgen_init(void) +{ + rekey_seq_generator(NULL); + return 0; +} +late_initcall(seqgen_init); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +__u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport) +{ + __u32 seq; + __u32 hash[12]; + struct keydata *keyptr = get_keyptr(); + + /* The procedure is the same as for IPv4, but addresses are longer. + * Thus we must use twothirdsMD4Transform. + */ + + memcpy(hash, saddr, 16); + hash[4] = ((__force u16)sport << 16) + (__force u16)dport; + memcpy(&hash[5], keyptr->secret, sizeof(__u32) * 7); + + seq = twothirdsMD4Transform((const __u32 *)daddr, hash) & HASH_MASK; + seq += keyptr->count; + + seq += ktime_to_ns(ktime_get_real()); + + return seq; +} +EXPORT_SYMBOL(secure_tcpv6_sequence_number); +#endif + +/* The code below is shamelessly stolen from secure_tcp_sequence_number(). + * All blames to Andrey V. Savochkin <saw@msu.ru>. + */ +__u32 secure_ip_id(__be32 daddr) +{ + struct keydata *keyptr; + __u32 hash[4]; + + keyptr = get_keyptr(); + + /* + * Pick a unique starting offset for each IP destination. + * The dest ip address is placed in the starting vector, + * which is then hashed with random data. + */ + hash[0] = (__force __u32)daddr; + hash[1] = keyptr->secret[9]; + hash[2] = keyptr->secret[10]; + hash[3] = keyptr->secret[11]; + + return half_md4_transform(hash, keyptr->secret); +} + +#ifdef CONFIG_INET + +__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + __u32 seq; + __u32 hash[4]; + struct keydata *keyptr = get_keyptr(); + + /* + * Pick a unique starting offset for each TCP connection endpoints + * (saddr, daddr, sport, dport). + * Note that the words are placed into the starting vector, which is + * then mixed with a partial MD4 over random data. 
+ */ + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = keyptr->secret[11]; + + seq = half_md4_transform(hash, keyptr->secret) & HASH_MASK; + seq += keyptr->count; + /* + * As close as possible to RFC 793, which + * suggests using a 250 kHz clock. + * Further reading shows this assumes 2 Mb/s networks. + * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate. + * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but + * we also need to limit the resolution so that the u32 seq + * overlaps less than one time per MSL (2 minutes). + * Choosing a clock of 64 ns period is OK. (period of 274 s) + */ + seq += ktime_to_ns(ktime_get_real()) >> 6; + + return seq; +} + +/* Generate secure starting point for ephemeral IPV4 transport port search */ +u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +{ + struct keydata *keyptr = get_keyptr(); + u32 hash[4]; + + /* + * Pick a unique starting offset for each ephemeral port search + * (saddr, daddr, dport) and 48bits of random data. + */ + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = (__force u32)dport ^ keyptr->secret[10]; + hash[3] = keyptr->secret[11]; + + return half_md4_transform(hash, keyptr->secret); +} +EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport) +{ + struct keydata *keyptr = get_keyptr(); + u32 hash[12]; + + memcpy(hash, saddr, 16); + hash[4] = (__force u32)dport; + memcpy(&hash[5], keyptr->secret, sizeof(__u32) * 7); + + return twothirdsMD4Transform((const __u32 *)daddr, hash); +} +#endif + +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +/* Similar to secure_tcp_sequence_number but generate a 48 bit value + * bit's 32-47 increase every key exchange + * 0-31 hash(source, dest) + */ +u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u64 seq; + __u32 hash[4]; + struct keydata *keyptr = get_keyptr(); + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = keyptr->secret[11]; + + seq = half_md4_transform(hash, keyptr->secret); + seq |= ((u64)keyptr->count) << (32 - HASH_BITS); + + seq += ktime_to_ns(ktime_get_real()); + seq &= (1ull << 48) - 1; + + return seq; +} +EXPORT_SYMBOL(secure_dccp_sequence_number); +#endif + +#endif /* CONFIG_INET */ + + +/* + * Get a random word for internal kernel use only. Similar to urandom but + * with the goal of minimal entropy pool depletion. As a result, the random + * value is not cryptographically secure but for several uses the cost of + * depleting entropy is too high + */ +unsigned int get_random_int(void) +{ + /* + * Use IP's RNG. It suits our purpose perfectly: it re-keys itself + * every second, from the entropy pool (and thus creates a limited + * drain on it), and uses halfMD4Transform within the second. We + * also mix it with jiffies and the PID: + */ + return secure_ip_id((__force __be32)(current->pid + jiffies)); +} + +/* + * randomize_range() returns a start address such that + * + * [...... <range> .....] + * start end + * + * a <range> with size "len" starting at the return value is inside in the + * area defined by [start, end], but is otherwise randomized. 
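[Editor's note] Spelling out the clock arithmetic in secure_tcp_sequence_number() above, as a back-of-the-envelope check rather than anything from the original source: ktime_to_ns() is shifted right by 6, so the effective tick is 2^6 = 64 ns, and the 32-bit sequence space wraps after 2^32 * 64 ns = 4294967296 * 64 ns, which is about 274.9 s. That is the "period of 274 s" the comment quotes, comfortably longer than the 2-minute MSL it needs to stay above.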
+ */ +unsigned long +randomize_range(unsigned long start, unsigned long end, unsigned long len) +{ + unsigned long range = end - len - start; + + if (end <= start + len) + return 0; + return PAGE_ALIGN(get_random_int() % range + start); +} + +#endif diff --git a/libdde-linux26/lib/src/drivers/pci/pci-driver.c b/libdde-linux26/lib/src/drivers/pci/pci-driver.c new file mode 100644 index 00000000..199ec8a7 --- /dev/null +++ b/libdde-linux26/lib/src/drivers/pci/pci-driver.c @@ -0,0 +1,1008 @@ +/* + * drivers/pci/pci-driver.c + * + * (C) Copyright 2002-2004, 2007 Greg Kroah-Hartman <greg@kroah.com> + * (C) Copyright 2007 Novell Inc. + * + * Released under the GPL v2 only. + * + */ + +#include <linux/pci.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/device.h> +#include <linux/mempolicy.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/cpu.h> +#include "pci.h" + +#ifdef DDE_LINUX +#include "local.h" +#endif /* DDE_LINUX */ + +/* + * Dynamic device IDs are disabled for !CONFIG_HOTPLUG + */ + +struct pci_dynid { + struct list_head node; + struct pci_device_id id; +}; + +#ifdef CONFIG_HOTPLUG + +/** + * store_new_id - add a new PCI device ID to this driver and re-probe devices + * @driver: target device driver + * @buf: buffer for scanning device ID data + * @count: input size + * + * Adds a new dynamic pci device ID to this driver, + * and causes the driver to probe for all devices again. + */ +static ssize_t +store_new_id(struct device_driver *driver, const char *buf, size_t count) +{ + struct pci_dynid *dynid; + struct pci_driver *pdrv = to_pci_driver(driver); + const struct pci_device_id *ids = pdrv->id_table; + __u32 vendor, device, subvendor=PCI_ANY_ID, + subdevice=PCI_ANY_ID, class=0, class_mask=0; + unsigned long driver_data=0; + int fields=0; + int retval=0; + + fields = sscanf(buf, "%x %x %x %x %x %x %lx", + &vendor, &device, &subvendor, &subdevice, + &class, &class_mask, &driver_data); + if (fields < 2) + return -EINVAL; + + /* Only accept driver_data values that match an existing id_table + entry */ + if (ids) { + retval = -EINVAL; + while (ids->vendor || ids->subvendor || ids->class_mask) { + if (driver_data == ids->driver_data) { + retval = 0; + break; + } + ids++; + } + if (retval) /* No match */ + return retval; + } + + dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); + if (!dynid) + return -ENOMEM; + + dynid->id.vendor = vendor; + dynid->id.device = device; + dynid->id.subvendor = subvendor; + dynid->id.subdevice = subdevice; + dynid->id.class = class; + dynid->id.class_mask = class_mask; + dynid->id.driver_data = driver_data; + + spin_lock(&pdrv->dynids.lock); + list_add_tail(&dynid->node, &pdrv->dynids.list); + spin_unlock(&pdrv->dynids.lock); + + if (get_driver(&pdrv->driver)) { + retval = driver_attach(&pdrv->driver); + put_driver(&pdrv->driver); + } + + if (retval) + return retval; + return count; +} +static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id); + +static void +pci_free_dynids(struct pci_driver *drv) +{ + struct pci_dynid *dynid, *n; + + spin_lock(&drv->dynids.lock); + list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) { + list_del(&dynid->node); + kfree(dynid); + } + spin_unlock(&drv->dynids.lock); +} + +static int +pci_create_newid_file(struct pci_driver *drv) +{ + int error = 0; + if (drv->probe != NULL) + error = driver_create_file(&drv->driver, &driver_attr_new_id); + return error; +} + +static void pci_remove_newid_file(struct pci_driver *drv) +{ + driver_remove_file(&drv->driver, 
&driver_attr_new_id); +} +#else /* !CONFIG_HOTPLUG */ +static inline void pci_free_dynids(struct pci_driver *drv) {} +static inline int pci_create_newid_file(struct pci_driver *drv) +{ + return 0; +} +static inline void pci_remove_newid_file(struct pci_driver *drv) {} +#endif + +/** + * pci_match_id - See if a pci device matches a given pci_id table + * @ids: array of PCI device id structures to search in + * @dev: the PCI device structure to match against. + * + * Used by a driver to check whether a PCI device present in the + * system is in its list of supported devices. Returns the matching + * pci_device_id structure or %NULL if there is no match. + * + * Deprecated, don't use this as it will not catch any dynamic ids + * that a driver might want to check for. + */ +const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, + struct pci_dev *dev) +{ + if (ids) { + while (ids->vendor || ids->subvendor || ids->class_mask) { + if (pci_match_one_device(ids, dev)) + return ids; + ids++; + } + } + return NULL; +} + +/** + * pci_match_device - Tell if a PCI device structure has a matching PCI device id structure + * @drv: the PCI driver to match against + * @dev: the PCI device structure to match against + * + * Used by a driver to check whether a PCI device present in the + * system is in its list of supported devices. Returns the matching + * pci_device_id structure or %NULL if there is no match. + */ +static const struct pci_device_id *pci_match_device(struct pci_driver *drv, + struct pci_dev *dev) +{ + struct pci_dynid *dynid; + + /* Look at the dynamic ids first, before the static ones */ + spin_lock(&drv->dynids.lock); + list_for_each_entry(dynid, &drv->dynids.list, node) { + if (pci_match_one_device(&dynid->id, dev)) { + spin_unlock(&drv->dynids.lock); + return &dynid->id; + } + } + spin_unlock(&drv->dynids.lock); + + return pci_match_id(drv->id_table, dev); +} + +struct drv_dev_and_id { + struct pci_driver *drv; + struct pci_dev *dev; + const struct pci_device_id *id; +}; + +static long local_pci_probe(void *_ddi) +{ + struct drv_dev_and_id *ddi = _ddi; + + return ddi->drv->probe(ddi->dev, ddi->id); +} + +static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, + const struct pci_device_id *id) +{ + int error, node; + struct drv_dev_and_id ddi = { drv, dev, id }; + + /* Execute driver initialization on node where the device's + bus is attached to. This way the driver likely allocates + its local memory on the right node without any need to + change it. */ + node = dev_to_node(&dev->dev); + if (node >= 0) { + int cpu; + node_to_cpumask_ptr(nodecpumask, node); + + get_online_cpus(); + cpu = cpumask_any_and(nodecpumask, cpu_online_mask); + if (cpu < nr_cpu_ids) + error = work_on_cpu(cpu, local_pci_probe, &ddi); + else + error = local_pci_probe(&ddi); + put_online_cpus(); + } else + error = local_pci_probe(&ddi); + return error; +} + +/** + * __pci_device_probe() + * @drv: driver to call to check if it wants the PCI device + * @pci_dev: PCI device being probed + * + * returns 0 on success, else error. + * side-effect: pci_dev->driver is set to drv when drv claims pci_dev. 
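[Editor's note] For reference, the comparison that pci_match_id() and pci_match_device() above delegate to (pci_match_one_device(), a helper in drivers/pci/pci.h) treats PCI_ANY_ID as a wildcard in every ID field and masks the class with class_mask. A self-contained sketch of those semantics with stand-in types, compilable outside the kernel:

    #include <stdbool.h>

    #define PCI_ANY_ID (~0u)

    struct id {
            unsigned int vendor, device, subvendor, subdevice;
            unsigned int class, class_mask;
    };

    struct dev {
            unsigned int vendor, device, subsystem_vendor, subsystem_device;
            unsigned int class;
    };

    static bool match_one(const struct id *id, const struct dev *dev)
    {
            return (id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
                   (id->device == PCI_ANY_ID || id->device == dev->device) &&
                   (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) &&
                   (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) &&
                   !((id->class ^ dev->class) & id->class_mask);
    }

    int main(void)
    {
            struct id any = { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0 };
            struct dev d = { 0x8086, 0x100e, 0x8086, 0x001e, 0x020000 };

            return match_one(&any, &d) ? 0 : 1;  /* a wildcard entry matches anything */
    }

This also shows why the loops above stop at an all-zero entry: a table is terminated by an element whose vendor, subvendor and class_mask are all zero.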
+ */ +static int +__pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) +{ + const struct pci_device_id *id; + int error = 0; + + if (!pci_dev->driver && drv->probe) { + error = -ENODEV; + + id = pci_match_device(drv, pci_dev); + if (id) + error = pci_call_probe(drv, pci_dev, id); + if (error >= 0) { + pci_dev->driver = drv; + error = 0; + } + } + return error; +} + +static int pci_device_probe(struct device * dev) +{ + int error = 0; + struct pci_driver *drv; + struct pci_dev *pci_dev; + + drv = to_pci_driver(dev->driver); + pci_dev = to_pci_dev(dev); + pci_dev_get(pci_dev); + error = __pci_device_probe(drv, pci_dev); + if (error) + pci_dev_put(pci_dev); + + return error; +} + +static int pci_device_remove(struct device * dev) +{ + struct pci_dev * pci_dev = to_pci_dev(dev); + struct pci_driver * drv = pci_dev->driver; + + if (drv) { + if (drv->remove) + drv->remove(pci_dev); + pci_dev->driver = NULL; + } + + /* + * If the device is still on, set the power state as "unknown", + * since it might change by the next time we load the driver. + */ + if (pci_dev->current_state == PCI_D0) + pci_dev->current_state = PCI_UNKNOWN; + + /* + * We would love to complain here if pci_dev->is_enabled is set, that + * the driver should have called pci_disable_device(), but the + * unfortunate fact is there are too many odd BIOS and bridge setups + * that don't like drivers doing that all of the time. + * Oh well, we can dream of sane hardware when we sleep, no matter how + * horrible the crap we have to deal with is when we are awake... + */ + + pci_dev_put(pci_dev); + return 0; +} + +static void pci_device_shutdown(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct pci_driver *drv = pci_dev->driver; + + if (drv && drv->shutdown) + drv->shutdown(pci_dev); + pci_msi_shutdown(pci_dev); + pci_msix_shutdown(pci_dev); +} + +#ifdef CONFIG_PM_SLEEP + +/* + * Default "suspend" method for devices that have no driver provided suspend, + * or not even a driver at all (second part). + */ +static void pci_pm_set_unknown_state(struct pci_dev *pci_dev) +{ + /* + * mark its power state as "unknown", since we don't know if + * e.g. the BIOS will change its device state when we suspend. + */ + if (pci_dev->current_state == PCI_D0) + pci_dev->current_state = PCI_UNKNOWN; +} + +/* + * Default "resume" method for devices that have no driver provided resume, + * or not even a driver at all (second part). 
+ */ +static int pci_pm_reenable_device(struct pci_dev *pci_dev) +{ + int retval; + + /* if the device was enabled before suspend, reenable */ + retval = pci_reenable_device(pci_dev); + /* + * if the device was busmaster before the suspend, make it busmaster + * again + */ + if (pci_dev->is_busmaster) + pci_set_master(pci_dev); + + return retval; +} + +static int pci_legacy_suspend(struct device *dev, pm_message_t state) +{ +#ifndef DDE_LINUX + struct pci_dev * pci_dev = to_pci_dev(dev); + struct pci_driver * drv = pci_dev->driver; + int i = 0; + + if (drv && drv->suspend) { + pci_power_t prev = pci_dev->current_state; + + pci_dev->state_saved = false; + + i = drv->suspend(pci_dev, state); + suspend_report_result(drv->suspend, i); + if (i) + return i; + + if (pci_dev->state_saved) + goto Fixup; + + if (pci_dev->current_state != PCI_D0 + && pci_dev->current_state != PCI_UNKNOWN) { + WARN_ONCE(pci_dev->current_state != prev, + "PCI PM: Device state not saved by %pF\n", + drv->suspend); + goto Fixup; + } + } + + pci_save_state(pci_dev); + /* + * This is for compatibility with existing code with legacy PM support. + */ + pci_pm_set_unknown_state(pci_dev); + + Fixup: + pci_fixup_device(pci_fixup_suspend, pci_dev); + + return i; +#else + WARN_UNIMPL; + return 0; +#endif /* DDE_LINUX */ +} + +static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) +{ +#ifndef DDE_LINUX + struct pci_dev * pci_dev = to_pci_dev(dev); + struct pci_driver * drv = pci_dev->driver; + int i = 0; + + if (drv && drv->suspend_late) { + i = drv->suspend_late(pci_dev, state); + suspend_report_result(drv->suspend_late, i); + } + return i; +#else + WARN_UNIMPL; + return 0; +#endif +} + +static int pci_legacy_resume_early(struct device *dev) +{ + struct pci_dev * pci_dev = to_pci_dev(dev); + struct pci_driver * drv = pci_dev->driver; + + return drv && drv->resume_early ? + drv->resume_early(pci_dev) : 0; +} + +static int pci_legacy_resume(struct device *dev) +{ + struct pci_dev * pci_dev = to_pci_dev(dev); + struct pci_driver * drv = pci_dev->driver; + + pci_fixup_device(pci_fixup_resume, pci_dev); + + return drv && drv->resume ? + drv->resume(pci_dev) : pci_pm_reenable_device(pci_dev); +} + +/* Auxiliary functions used by the new power management framework */ + +static void pci_pm_default_resume_noirq(struct pci_dev *pci_dev) +{ + pci_restore_standard_config(pci_dev); + pci_dev->state_saved = false; + pci_fixup_device(pci_fixup_resume_early, pci_dev); +} + +static void pci_pm_default_resume(struct pci_dev *pci_dev) +{ + pci_fixup_device(pci_fixup_resume, pci_dev); + + if (!pci_is_bridge(pci_dev)) + pci_enable_wake(pci_dev, PCI_D0, false); +} + +static void pci_pm_default_suspend(struct pci_dev *pci_dev) +{ + /* Disable non-bridge devices without PM support */ + if (!pci_is_bridge(pci_dev)) + pci_disable_enabled_device(pci_dev); + pci_save_state(pci_dev); +} + +static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) +{ + struct pci_driver *drv = pci_dev->driver; + bool ret = drv && (drv->suspend || drv->suspend_late || drv->resume + || drv->resume_early); + + /* + * Legacy PM support is used by default, so warn if the new framework is + * supported as well. Drivers are supposed to support either the + * former, or the latter, but not both at the same time. 
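[Editor's note] To make the distinction above concrete: a driver advertises power management either through the legacy hooks embedded in struct pci_driver or through a dev_pm_ops attached to its embedded device_driver, and pci_has_legacy_pm_support() below warns when both are present. A generic declaration sketch with throw-away stub callbacks; none of these symbols exist in this tree:

    #include <linux/pci.h>

    /* (a) legacy style: hooks directly in struct pci_driver */
    static int example_suspend(struct pci_dev *pdev, pm_message_t state) { return 0; }
    static int example_resume(struct pci_dev *pdev) { return 0; }

    static struct pci_driver legacy_drv = {
            .name    = "example-legacy",
            .suspend = example_suspend,
            .resume  = example_resume,
    };

    /* (b) new framework: a dev_pm_ops hung off the embedded device_driver */
    static int example_dev_suspend(struct device *dev) { return 0; }
    static int example_dev_resume(struct device *dev) { return 0; }

    static struct dev_pm_ops example_pm_ops = {
            .suspend = example_dev_suspend,
            .resume  = example_dev_resume,
    };

    static struct pci_driver new_drv = {
            .name   = "example-new",
            .driver = { .pm = &example_pm_ops },
    };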
+ */ + WARN_ON(ret && drv->driver.pm); + + return ret; +} + +/* New power management framework */ + +static int pci_pm_prepare(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int error = 0; + + if (drv && drv->pm && drv->pm->prepare) + error = drv->pm->prepare(dev); + + return error; +} + +static void pci_pm_complete(struct device *dev) +{ + struct device_driver *drv = dev->driver; + + if (drv && drv->pm && drv->pm->complete) + drv->pm->complete(dev); +} + +#ifdef CONFIG_SUSPEND + +static int pci_pm_suspend(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_suspend(dev, PMSG_SUSPEND); + + if (!pm) { + pci_pm_default_suspend(pci_dev); + goto Fixup; + } + + pci_dev->state_saved = false; + + if (pm->suspend) { + pci_power_t prev = pci_dev->current_state; + int error; + + error = pm->suspend(dev); + suspend_report_result(pm->suspend, error); + if (error) + return error; + + if (pci_dev->state_saved) + goto Fixup; + + if (pci_dev->current_state != PCI_D0 + && pci_dev->current_state != PCI_UNKNOWN) { + WARN_ONCE(pci_dev->current_state != prev, + "PCI PM: State of device not saved by %pF\n", + pm->suspend); + goto Fixup; + } + } + + if (!pci_dev->state_saved) { + pci_save_state(pci_dev); + if (!pci_is_bridge(pci_dev)) + pci_prepare_to_sleep(pci_dev); + } + + Fixup: + pci_fixup_device(pci_fixup_suspend, pci_dev); + + return 0; +} + +static int pci_pm_suspend_noirq(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct device_driver *drv = dev->driver; + int error = 0; + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_suspend_late(dev, PMSG_SUSPEND); + + if (drv && drv->pm && drv->pm->suspend_noirq) { + error = drv->pm->suspend_noirq(dev); + suspend_report_result(drv->pm->suspend_noirq, error); + } + + if (!error) + pci_pm_set_unknown_state(pci_dev); + + return error; +} + +static int pci_pm_resume_noirq(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct device_driver *drv = dev->driver; + int error = 0; + + pci_pm_default_resume_noirq(pci_dev); + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_resume_early(dev); + + if (drv && drv->pm && drv->pm->resume_noirq) + error = drv->pm->resume_noirq(dev); + + return error; +} + +static int pci_pm_resume(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + int error = 0; + + /* + * This is necessary for the suspend error path in which resume is + * called without restoring the standard config registers of the device. + */ + if (pci_dev->state_saved) + pci_restore_standard_config(pci_dev); + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_resume(dev); + + pci_pm_default_resume(pci_dev); + + if (pm) { + if (pm->resume) + error = pm->resume(dev); + } else { + pci_pm_reenable_device(pci_dev); + } + + return 0; +} + +#else /* !CONFIG_SUSPEND */ + +#define pci_pm_suspend NULL +#define pci_pm_suspend_noirq NULL +#define pci_pm_resume NULL +#define pci_pm_resume_noirq NULL + +#endif /* !CONFIG_SUSPEND */ + +#ifdef CONFIG_HIBERNATION + +static int pci_pm_freeze(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_suspend(dev, PMSG_FREEZE); + + if (!pm) { + pci_pm_default_suspend(pci_dev); + return 0; + } + + pci_dev->state_saved = false; + + if (pm->freeze) { + int error; + + error = pm->freeze(dev); + suspend_report_result(pm->freeze, error); + if (error) + return error; + } + + if (!pci_dev->state_saved) + pci_save_state(pci_dev); + + return 0; +} + +static int pci_pm_freeze_noirq(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct device_driver *drv = dev->driver; + int error = 0; + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_suspend_late(dev, PMSG_FREEZE); + + if (drv && drv->pm && drv->pm->freeze_noirq) { + error = drv->pm->freeze_noirq(dev); + suspend_report_result(drv->pm->freeze_noirq, error); + } + + if (!error) + pci_pm_set_unknown_state(pci_dev); + + return error; +} + +static int pci_pm_thaw_noirq(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct device_driver *drv = dev->driver; + int error = 0; + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_resume_early(dev); + + pci_update_current_state(pci_dev, PCI_D0); + + if (drv && drv->pm && drv->pm->thaw_noirq) + error = drv->pm->thaw_noirq(dev); + + return error; +} + +static int pci_pm_thaw(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + int error = 0; + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_resume(dev); + + if (pm) { + if (pm->thaw) + error = pm->thaw(dev); + } else { + pci_pm_reenable_device(pci_dev); + } + + return error; +} + +static int pci_pm_poweroff(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + int error = 0; + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_suspend(dev, PMSG_HIBERNATE); + + if (!pm) { + pci_pm_default_suspend(pci_dev); + goto Fixup; + } + + pci_dev->state_saved = false; + + if (pm->poweroff) { + error = pm->poweroff(dev); + suspend_report_result(pm->poweroff, error); + } + + if (!pci_dev->state_saved && !pci_is_bridge(pci_dev)) + pci_prepare_to_sleep(pci_dev); + + Fixup: + pci_fixup_device(pci_fixup_suspend, pci_dev); + + return error; +} + +static int pci_pm_poweroff_noirq(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int error = 0; + + if (pci_has_legacy_pm_support(to_pci_dev(dev))) + return pci_legacy_suspend_late(dev, PMSG_HIBERNATE); + + if (drv && drv->pm && drv->pm->poweroff_noirq) { + error = drv->pm->poweroff_noirq(dev); + suspend_report_result(drv->pm->poweroff_noirq, error); + } + + return error; +} + +static int pci_pm_restore_noirq(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct device_driver *drv = dev->driver; + int error = 0; + + pci_pm_default_resume_noirq(pci_dev); + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_resume_early(dev); + + if (drv && drv->pm && drv->pm->restore_noirq) + error = drv->pm->restore_noirq(dev); + + return error; +} + +static int pci_pm_restore(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + int error = 0; + + /* + * This is necessary for the hibernation error path in which restore is + * called without restoring the standard config registers of the device. 
+ */ + if (pci_dev->state_saved) + pci_restore_standard_config(pci_dev); + + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_resume(dev); + + pci_pm_default_resume(pci_dev); + + if (pm) { + if (pm->restore) + error = pm->restore(dev); + } else { + pci_pm_reenable_device(pci_dev); + } + + return error; +} + +#else /* !CONFIG_HIBERNATION */ + +#define pci_pm_freeze NULL +#define pci_pm_freeze_noirq NULL +#define pci_pm_thaw NULL +#define pci_pm_thaw_noirq NULL +#define pci_pm_poweroff NULL +#define pci_pm_poweroff_noirq NULL +#define pci_pm_restore NULL +#define pci_pm_restore_noirq NULL + +#endif /* !CONFIG_HIBERNATION */ + +struct dev_pm_ops pci_dev_pm_ops = { + .prepare = pci_pm_prepare, + .complete = pci_pm_complete, + .suspend = pci_pm_suspend, + .resume = pci_pm_resume, + .freeze = pci_pm_freeze, + .thaw = pci_pm_thaw, + .poweroff = pci_pm_poweroff, + .restore = pci_pm_restore, + .suspend_noirq = pci_pm_suspend_noirq, + .resume_noirq = pci_pm_resume_noirq, + .freeze_noirq = pci_pm_freeze_noirq, + .thaw_noirq = pci_pm_thaw_noirq, + .poweroff_noirq = pci_pm_poweroff_noirq, + .restore_noirq = pci_pm_restore_noirq, +}; + +#define PCI_PM_OPS_PTR (&pci_dev_pm_ops) + +#else /* !CONFIG_PM_SLEEP */ + +#define PCI_PM_OPS_PTR NULL + +#endif /* !CONFIG_PM_SLEEP */ + +/** + * __pci_register_driver - register a new pci driver + * @drv: the driver structure to register + * @owner: owner module of drv + * @mod_name: module name string + * + * Adds the driver structure to the list of registered drivers. + * Returns a negative value on error, otherwise 0. + * If no error occurred, the driver remains registered even if + * no device was claimed during registration. + */ +int __pci_register_driver(struct pci_driver *drv, struct module *owner, + const char *mod_name) +{ + int error; + + /* initialize common driver fields */ + drv->driver.name = drv->name; + drv->driver.bus = &pci_bus_type; + drv->driver.owner = owner; + drv->driver.mod_name = mod_name; + + spin_lock_init(&drv->dynids.lock); + INIT_LIST_HEAD(&drv->dynids.list); + + /* register with core */ + error = driver_register(&drv->driver); + if (error) + return error; + + error = pci_create_newid_file(drv); + if (error) + driver_unregister(&drv->driver); + + return error; +} + +/** + * pci_unregister_driver - unregister a pci driver + * @drv: the driver structure to unregister + * + * Deletes the driver structure from the list of registered PCI drivers, + * gives it a chance to clean up by calling its remove() function for + * each device it was responsible for, and marks those devices as + * driverless. + */ + +void +pci_unregister_driver(struct pci_driver *drv) +{ + pci_remove_newid_file(drv); + driver_unregister(&drv->driver); + pci_free_dynids(drv); +} + +static struct pci_driver pci_compat_driver = { + .name = "compat" +}; + +/** + * pci_dev_driver - get the pci_driver of a device + * @dev: the device to query + * + * Returns the appropriate pci_driver structure or %NULL if there is no + * registered driver for the device. 
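[Editor's note] For context, __pci_register_driver() above is normally reached through the pci_register_driver() wrapper from a module's init path, and pci_unregister_driver() from its exit path. A generic skeleton; every name and the vendor/device pair are invented for illustration and do not come from this tree:

    #include <linux/module.h>
    #include <linux/pci.h>

    static struct pci_device_id example_ids[] = {
            { PCI_DEVICE(0x1234, 0x5678) }, /* made-up vendor/device */
            { 0, }
    };
    MODULE_DEVICE_TABLE(pci, example_ids);

    static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
    {
            return pci_enable_device(pdev);
    }

    static void example_remove(struct pci_dev *pdev)
    {
            pci_disable_device(pdev);
    }

    static struct pci_driver example_driver = {
            .name     = "example",
            .id_table = example_ids,
            .probe    = example_probe,
            .remove   = example_remove,
    };

    static int __init example_init(void)
    {
            return pci_register_driver(&example_driver);
    }

    static void __exit example_exit(void)
    {
            pci_unregister_driver(&example_driver);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");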
+ */ +struct pci_driver * +pci_dev_driver(const struct pci_dev *dev) +{ + if (dev->driver) + return dev->driver; + else { + int i; + for(i=0; i<=PCI_ROM_RESOURCE; i++) + if (dev->resource[i].flags & IORESOURCE_BUSY) + return &pci_compat_driver; + } + return NULL; +} + +/** + * pci_bus_match - Tell if a PCI device structure has a matching PCI device id structure + * @dev: the PCI device structure to match against + * @drv: the device driver to search for matching PCI device id structures + * + * Used by a driver to check whether a PCI device present in the + * system is in its list of supported devices. Returns the matching + * pci_device_id structure or %NULL if there is no match. + */ +static int pci_bus_match(struct device *dev, struct device_driver *drv) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + struct pci_driver *pci_drv = to_pci_driver(drv); + const struct pci_device_id *found_id; + + found_id = pci_match_device(pci_drv, pci_dev); + if (found_id) + return 1; + + return 0; +} + +/** + * pci_dev_get - increments the reference count of the pci device structure + * @dev: the device being referenced + * + * Each live reference to a device should be refcounted. + * + * Drivers for PCI devices should normally record such references in + * their probe() methods, when they bind to a device, and release + * them by calling pci_dev_put(), in their disconnect() methods. + * + * A pointer to the device with the incremented reference counter is returned. + */ +struct pci_dev *pci_dev_get(struct pci_dev *dev) +{ + if (dev) + get_device(&dev->dev); + return dev; +} + +/** + * pci_dev_put - release a use of the pci device structure + * @dev: device that's been disconnected + * + * Must be called when a user of a device is finished with it. When the last + * user of the device calls this function, the memory of the device is freed. + */ +void pci_dev_put(struct pci_dev *dev) +{ + if (dev) + put_device(&dev->dev); +} + +#ifndef CONFIG_HOTPLUG +int pci_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return -ENODEV; +} +#endif + +struct bus_type pci_bus_type = { + .name = "pci", + .match = pci_bus_match, + .uevent = pci_uevent, + .probe = pci_device_probe, + .remove = pci_device_remove, + .shutdown = pci_device_shutdown, +#ifndef DDE_LINUX + .dev_attrs = pci_dev_attrs, +#endif + .pm = PCI_PM_OPS_PTR, +}; + +static int __init pci_driver_init(void) +{ + return bus_register(&pci_bus_type); +} + +postcore_initcall(pci_driver_init); + +EXPORT_SYMBOL(pci_match_id); +EXPORT_SYMBOL(__pci_register_driver); +EXPORT_SYMBOL(pci_unregister_driver); +EXPORT_SYMBOL(pci_dev_driver); +EXPORT_SYMBOL(pci_bus_type); +EXPORT_SYMBOL(pci_dev_get); +EXPORT_SYMBOL(pci_dev_put); diff --git a/libdde-linux26/lib/src/drivers/pci/pci.c b/libdde-linux26/lib/src/drivers/pci/pci.c new file mode 100644 index 00000000..9350ffed --- /dev/null +++ b/libdde-linux26/lib/src/drivers/pci/pci.c @@ -0,0 +1,2478 @@ +/* + * PCI Bus Services, see include/linux/pci.h for further explanation. 
+ * + * Copyright 1993 -- 1997 Drew Eckhardt, Frederic Potter, + * David Mosberger-Tang + * + * Copyright 1997 -- 2000 Martin Mares <mj@ucw.cz> + */ + +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/pm.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/log2.h> +#include <linux/pci-aspm.h> +#include <linux/pm_wakeup.h> +#include <linux/interrupt.h> +#include <asm/dma.h> /* isa_dma_bridge_buggy */ +#include "pci.h" + +#ifdef DDE_LINUX +#include "local.h" +#endif + +unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT; + +#ifdef CONFIG_PCI_DOMAINS +int pci_domains_supported = 1; +#endif + +#define DEFAULT_CARDBUS_IO_SIZE (256) +#define DEFAULT_CARDBUS_MEM_SIZE (64*1024*1024) +/* pci=cbmemsize=nnM,cbiosize=nn can override this */ +unsigned long pci_cardbus_io_size = DEFAULT_CARDBUS_IO_SIZE; +unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; + +/** + * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children + * @bus: pointer to PCI bus structure to search + * + * Given a PCI bus, returns the highest PCI bus number present in the set + * including the given PCI bus and its list of child PCI buses. + */ +unsigned char pci_bus_max_busnr(struct pci_bus* bus) +{ + struct list_head *tmp; + unsigned char max, n; + + max = bus->subordinate; + list_for_each(tmp, &bus->children) { + n = pci_bus_max_busnr(pci_bus_b(tmp)); + if(n > max) + max = n; + } + return max; +} +EXPORT_SYMBOL_GPL(pci_bus_max_busnr); + +#ifdef CONFIG_HAS_IOMEM +void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar) +{ + /* + * Make sure the BAR is actually a memory resource, not an IO resource + */ + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) { + WARN_ON(1); + return NULL; + } + return ioremap_nocache(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar)); +} +EXPORT_SYMBOL_GPL(pci_ioremap_bar); +#endif + +#if 0 +/** + * pci_max_busnr - returns maximum PCI bus number + * + * Returns the highest PCI bus number present in the system global list of + * PCI buses. 
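[Editor's note] pci_ioremap_bar() above is a driver-side convenience wrapper. A generic probe-side usage sketch, with invented names and not taken from this tree: enable the device, claim its regions, then map BAR 0.

    #include <linux/pci.h>

    static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
    {
            void __iomem *regs;
            int err;

            err = pci_enable_device_mem(pdev);
            if (err)
                    return err;

            err = pci_request_regions(pdev, "example");
            if (err)
                    goto out_disable;

            regs = pci_ioremap_bar(pdev, 0);   /* fails if BAR 0 is not MMIO */
            if (!regs) {
                    err = -ENOMEM;
                    goto out_release;
            }

            pci_set_drvdata(pdev, regs);
            return 0;

    out_release:
            pci_release_regions(pdev);
    out_disable:
            pci_disable_device(pdev);
            return err;
    }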
+ */ +unsigned char __devinit +pci_max_busnr(void) +{ + struct pci_bus *bus = NULL; + unsigned char max, n; + + max = 0; + while ((bus = pci_find_next_bus(bus)) != NULL) { + n = pci_bus_max_busnr(bus); + if(n > max) + max = n; + } + return max; +} + +#endif /* 0 */ + +#define PCI_FIND_CAP_TTL 48 + +static int __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, + u8 pos, int cap, int *ttl) +{ + u8 id; + + while ((*ttl)--) { + pci_bus_read_config_byte(bus, devfn, pos, &pos); + if (pos < 0x40) + break; + pos &= ~3; + pci_bus_read_config_byte(bus, devfn, pos + PCI_CAP_LIST_ID, + &id); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} + +static int __pci_find_next_cap(struct pci_bus *bus, unsigned int devfn, + u8 pos, int cap) +{ + int ttl = PCI_FIND_CAP_TTL; + + return __pci_find_next_cap_ttl(bus, devfn, pos, cap, &ttl); +} + +int pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap) +{ + return __pci_find_next_cap(dev->bus, dev->devfn, + pos + PCI_CAP_LIST_NEXT, cap); +} +EXPORT_SYMBOL_GPL(pci_find_next_capability); + +static int __pci_bus_find_cap_start(struct pci_bus *bus, + unsigned int devfn, u8 hdr_type) +{ + u16 status; + + pci_bus_read_config_word(bus, devfn, PCI_STATUS, &status); + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; + + switch (hdr_type) { + case PCI_HEADER_TYPE_NORMAL: + case PCI_HEADER_TYPE_BRIDGE: + return PCI_CAPABILITY_LIST; + case PCI_HEADER_TYPE_CARDBUS: + return PCI_CB_CAPABILITY_LIST; + default: + return 0; + } + + return 0; +} + +/** + * pci_find_capability - query for devices' capabilities + * @dev: PCI device to query + * @cap: capability code + * + * Tell if a device supports a given PCI capability. + * Returns the address of the requested capability structure within the + * device's PCI configuration space or 0 in case the device does not + * support it. Possible values for @cap: + * + * %PCI_CAP_ID_PM Power Management + * %PCI_CAP_ID_AGP Accelerated Graphics Port + * %PCI_CAP_ID_VPD Vital Product Data + * %PCI_CAP_ID_SLOTID Slot Identification + * %PCI_CAP_ID_MSI Message Signalled Interrupts + * %PCI_CAP_ID_CHSWP CompactPCI HotSwap + * %PCI_CAP_ID_PCIX PCI-X + * %PCI_CAP_ID_EXP PCI Express + */ +int pci_find_capability(struct pci_dev *dev, int cap) +{ + int pos; + + pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); + if (pos) + pos = __pci_find_next_cap(dev->bus, dev->devfn, pos, cap); + + return pos; +} + +/** + * pci_bus_find_capability - query for devices' capabilities + * @bus: the PCI bus to query + * @devfn: PCI device to query + * @cap: capability code + * + * Like pci_find_capability() but works for pci devices that do not have a + * pci_dev structure set up yet. + * + * Returns the address of the requested capability structure within the + * device's PCI configuration space or 0 in case the device does not + * support it. + */ +int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap) +{ + int pos; + u8 hdr_type; + + pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type); + + pos = __pci_bus_find_cap_start(bus, devfn, hdr_type & 0x7f); + if (pos) + pos = __pci_find_next_cap(bus, devfn, pos, cap); + + return pos; +} + +/** + * pci_find_ext_capability - Find an extended capability + * @dev: PCI device to query + * @cap: capability code + * + * Returns the address of the requested extended capability structure + * within the device's PCI configuration space or 0 if the device does + * not support it. 
Possible values for @cap: + * + * %PCI_EXT_CAP_ID_ERR Advanced Error Reporting + * %PCI_EXT_CAP_ID_VC Virtual Channel + * %PCI_EXT_CAP_ID_DSN Device Serial Number + * %PCI_EXT_CAP_ID_PWR Power Budgeting + */ +int pci_find_ext_capability(struct pci_dev *dev, int cap) +{ + u32 header; + int ttl; + int pos = PCI_CFG_SPACE_SIZE; + + /* minimum 8 bytes per capability */ + ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8; + + if (dev->cfg_size <= PCI_CFG_SPACE_SIZE) + return 0; + + if (pci_read_config_dword(dev, pos, &header) != PCIBIOS_SUCCESSFUL) + return 0; + + /* + * If we have no capabilities, this is indicated by cap ID, + * cap version and next pointer all being 0. + */ + if (header == 0) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < PCI_CFG_SPACE_SIZE) + break; + + if (pci_read_config_dword(dev, pos, &header) != PCIBIOS_SUCCESSFUL) + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(pci_find_ext_capability); + +static int __pci_find_next_ht_cap(struct pci_dev *dev, int pos, int ht_cap) +{ + int rc, ttl = PCI_FIND_CAP_TTL; + u8 cap, mask; + + if (ht_cap == HT_CAPTYPE_SLAVE || ht_cap == HT_CAPTYPE_HOST) + mask = HT_3BIT_CAP_MASK; + else + mask = HT_5BIT_CAP_MASK; + + pos = __pci_find_next_cap_ttl(dev->bus, dev->devfn, pos, + PCI_CAP_ID_HT, &ttl); + while (pos) { + rc = pci_read_config_byte(dev, pos + 3, &cap); + if (rc != PCIBIOS_SUCCESSFUL) + return 0; + + if ((cap & mask) == ht_cap) + return pos; + + pos = __pci_find_next_cap_ttl(dev->bus, dev->devfn, + pos + PCI_CAP_LIST_NEXT, + PCI_CAP_ID_HT, &ttl); + } + + return 0; +} +/** + * pci_find_next_ht_capability - query a device's Hypertransport capabilities + * @dev: PCI device to query + * @pos: Position from which to continue searching + * @ht_cap: Hypertransport capability code + * + * To be used in conjunction with pci_find_ht_capability() to search for + * all capabilities matching @ht_cap. @pos should always be a value returned + * from pci_find_ht_capability(). + * + * NB. To be 100% safe against broken PCI devices, the caller should take + * steps to avoid an infinite loop. + */ +int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap) +{ + return __pci_find_next_ht_cap(dev, pos + PCI_CAP_LIST_NEXT, ht_cap); +} +EXPORT_SYMBOL_GPL(pci_find_next_ht_capability); + +/** + * pci_find_ht_capability - query a device's Hypertransport capabilities + * @dev: PCI device to query + * @ht_cap: Hypertransport capability code + * + * Tell if a device supports a given Hypertransport capability. + * Returns an address within the device's PCI configuration space + * or 0 in case the device does not support the request capability. + * The address points to the PCI capability, of type PCI_CAP_ID_HT, + * which has a Hypertransport capability matching @ht_cap. + */ +int pci_find_ht_capability(struct pci_dev *dev, int ht_cap) +{ + int pos; + + pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); + if (pos) + pos = __pci_find_next_ht_cap(dev, pos, ht_cap); + + return pos; +} +EXPORT_SYMBOL_GPL(pci_find_ht_capability); + +/** + * pci_find_parent_resource - return resource region of parent bus of given region + * @dev: PCI device structure contains resources to be searched + * @res: child resource record for which parent is sought + * + * For given resource region of given device, return the resource + * region of parent bus the given region is contained in or where + * it should be allocated from. 
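[Editor's note] The ttl counters threaded through the capability searches above exist only to bound the walk when a broken device presents a looping capability chain. The idea in isolation, with a toy config space (self-contained, nothing kernel-specific):

    #include <stdio.h>

    #define TTL 48

    /* toy "config space": cfg[pos] holds a capability id, cfg[pos + 1] the next pos */
    static unsigned char cfg[256];

    static int find_cap(unsigned char start, unsigned char want)
    {
            unsigned char pos = start;
            int ttl = TTL;

            while (pos >= 0x40 && ttl--) {
                    if (cfg[pos] == want)
                            return pos;
                    pos = cfg[pos + 1] & ~3;   /* next pointer, dword aligned */
            }
            return 0;
    }

    int main(void)
    {
            cfg[0x40] = 0x01; cfg[0x41] = 0x50;   /* PM cap  -> next at 0x50 */
            cfg[0x50] = 0x05; cfg[0x51] = 0x40;   /* MSI cap -> loops back!  */

            printf("MSI at %#x\n", find_cap(0x40, 0x05));
            printf("missing cap -> %#x (loop cut off by ttl)\n", find_cap(0x40, 0x10));
            return 0;
    }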
+ */ +struct resource * +pci_find_parent_resource(const struct pci_dev *dev, struct resource *res) +{ + const struct pci_bus *bus = dev->bus; + int i; + struct resource *best = NULL; + + for(i = 0; i < PCI_BUS_NUM_RESOURCES; i++) { + struct resource *r = bus->resource[i]; + if (!r) + continue; + if (res->start && !(res->start >= r->start && res->end <= r->end)) + continue; /* Not contained */ + if ((res->flags ^ r->flags) & (IORESOURCE_IO | IORESOURCE_MEM)) + continue; /* Wrong type */ + if (!((res->flags ^ r->flags) & IORESOURCE_PREFETCH)) + return r; /* Exact match */ + if ((res->flags & IORESOURCE_PREFETCH) && !(r->flags & IORESOURCE_PREFETCH)) + best = r; /* Approximating prefetchable by non-prefetchable */ + } + return best; +} + +/** + * pci_restore_bars - restore a devices BAR values (e.g. after wake-up) + * @dev: PCI device to have its BARs restored + * + * Restore the BAR values for a given device, so as to make it + * accessible by its driver. + */ +static void +pci_restore_bars(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) + pci_update_resource(dev, i); +} + +static struct pci_platform_pm_ops *pci_platform_pm; + +int pci_set_platform_pm(struct pci_platform_pm_ops *ops) +{ + if (!ops->is_manageable || !ops->set_state || !ops->choose_state + || !ops->sleep_wake || !ops->can_wakeup) + return -EINVAL; + pci_platform_pm = ops; + return 0; +} + +static inline bool platform_pci_power_manageable(struct pci_dev *dev) +{ + return pci_platform_pm ? pci_platform_pm->is_manageable(dev) : false; +} + +static inline int platform_pci_set_power_state(struct pci_dev *dev, + pci_power_t t) +{ + return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS; +} + +static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev) +{ + return pci_platform_pm ? + pci_platform_pm->choose_state(dev) : PCI_POWER_ERROR; +} + +static inline bool platform_pci_can_wakeup(struct pci_dev *dev) +{ + return pci_platform_pm ? pci_platform_pm->can_wakeup(dev) : false; +} + +static inline int platform_pci_sleep_wake(struct pci_dev *dev, bool enable) +{ + return pci_platform_pm ? + pci_platform_pm->sleep_wake(dev, enable) : -ENODEV; +} + +/** + * pci_raw_set_power_state - Use PCI PM registers to set the power state of + * given PCI device + * @dev: PCI device to handle. + * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. + * @wait: If 'true', wait for the device to change its power state + * + * RETURN VALUE: + * -EINVAL if the requested state is invalid. + * -EIO if device does not support PCI PM or its PM capabilities register has a + * wrong version, or device doesn't support the requested state. + * 0 if device already is in the requested state. + * 0 if device's power state has been successfully changed. 
+ */ +static int +pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state, bool wait) +{ + u16 pmcsr; + bool need_restore = false; + + if (!dev->pm_cap) + return -EIO; + + if (state < PCI_D0 || state > PCI_D3hot) + return -EINVAL; + + /* Validate current state: + * Can enter D0 from any state, but if we can only go deeper + * to sleep if we're already in a low power state + */ + if (dev->current_state == state) { + /* we're already there */ + return 0; + } else if (state != PCI_D0 && dev->current_state <= PCI_D3cold + && dev->current_state > state) { + dev_err(&dev->dev, "invalid power transition " + "(from state %d to %d)\n", dev->current_state, state); + return -EINVAL; + } + + /* check if this device supports the desired state */ + if ((state == PCI_D1 && !dev->d1_support) + || (state == PCI_D2 && !dev->d2_support)) + return -EIO; + + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); + + /* If we're (effectively) in D3, force entire word to 0. + * This doesn't affect PME_Status, disables PME_En, and + * sets PowerState to 0. + */ + switch (dev->current_state) { + case PCI_D0: + case PCI_D1: + case PCI_D2: + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + pmcsr |= state; + break; + case PCI_UNKNOWN: /* Boot-up */ + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) == PCI_D3hot + && !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET)) { + need_restore = true; + wait = true; + } + /* Fall-through: force to D0 */ + default: + pmcsr = 0; + break; + } + + /* enter specified state */ + pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); + + if (!wait) + return 0; + + /* Mandatory power management transition delays */ + /* see PCI PM 1.1 5.6.1 table 18 */ + if (state == PCI_D3hot || dev->current_state == PCI_D3hot) + msleep(pci_pm_d3_delay); + else if (state == PCI_D2 || dev->current_state == PCI_D2) + udelay(PCI_PM_D2_DELAY); + + dev->current_state = state; + + /* According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT + * INTERFACE SPECIFICATION, REV. 1.2", a device transitioning + * from D3hot to D0 _may_ perform an internal reset, thereby + * going to "D0 Uninitialized" rather than "D0 Initialized". + * For example, at least some versions of the 3c905B and the + * 3c556B exhibit this behaviour. + * + * At least some laptop BIOSen (e.g. the Thinkpad T21) leave + * devices in a D3hot state at boot. Consequently, we need to + * restore at least the BARs so that the device will be + * accessible to its driver. + */ + if (need_restore) + pci_restore_bars(dev); + + if (wait && dev->bus->self) + pcie_aspm_pm_state_change(dev->bus->self); + + return 0; +} + +/** + * pci_update_current_state - Read PCI power state of given device from its + * PCI PM registers and cache it + * @dev: PCI device to handle. + * @state: State to cache in case the device doesn't have the PM capability + */ +void pci_update_current_state(struct pci_dev *dev, pci_power_t state) +{ + if (dev->pm_cap) { + u16 pmcsr; + + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); + dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); + } else { + dev->current_state = state; + } +} + +/** + * pci_set_power_state - Set the power state of a PCI device + * @dev: PCI device to handle. + * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. + * + * Transition a device to a new power state, using the platform formware and/or + * the device's PCI PM registers. + * + * RETURN VALUE: + * -EINVAL if the requested state is invalid. 
+ * -EIO if device does not support PCI PM or its PM capabilities register has a + * wrong version, or device doesn't support the requested state. + * 0 if device already is in the requested state. + * 0 if device's power state has been successfully changed. + */ +int pci_set_power_state(struct pci_dev *dev, pci_power_t state) +{ + int error; + + /* bound the state we're entering */ + if (state > PCI_D3hot) + state = PCI_D3hot; + else if (state < PCI_D0) + state = PCI_D0; + else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev)) + /* + * If the device or the parent bridge do not support PCI PM, + * ignore the request if we're doing anything other than putting + * it into D0 (which would only happen on boot). + */ + return 0; + + if (state == PCI_D0 && platform_pci_power_manageable(dev)) { + /* + * Allow the platform to change the state, for example via ACPI + * _PR0, _PS0 and some such, but do not trust it. + */ + int ret = platform_pci_set_power_state(dev, PCI_D0); + if (!ret) + pci_update_current_state(dev, PCI_D0); + } + /* This device is quirked not to be put into D3, so + don't put it in D3 */ + if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3)) + return 0; + + error = pci_raw_set_power_state(dev, state, true); + + if (state > PCI_D0 && platform_pci_power_manageable(dev)) { + /* Allow the platform to finalize the transition */ + int ret = platform_pci_set_power_state(dev, state); + if (!ret) { + pci_update_current_state(dev, state); + error = 0; + } + } + + return error; +} + +/** + * pci_choose_state - Choose the power state of a PCI device + * @dev: PCI device to be suspended + * @state: target sleep state for the whole system. This is the value + * that is passed to suspend() function. + * + * Returns PCI power state suitable for given device and given system + * message. 
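[Editor's note] Putting pci_set_power_state() above together with pci_choose_state(), pci_save_state() and pci_restore_state() defined just below, the conventional legacy suspend/resume pairing looks roughly like this; a generic sketch with invented names, not code from this tree:

    #include <linux/pci.h>

    static int example_suspend(struct pci_dev *pdev, pm_message_t state)
    {
            pci_save_state(pdev);
            pci_disable_device(pdev);
            pci_set_power_state(pdev, pci_choose_state(pdev, state));
            return 0;
    }

    static int example_resume(struct pci_dev *pdev)
    {
            pci_set_power_state(pdev, PCI_D0);
            pci_restore_state(pdev);
            return pci_enable_device(pdev);
    }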
+ */ + +pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) +{ + pci_power_t ret; + + if (!pci_find_capability(dev, PCI_CAP_ID_PM)) + return PCI_D0; + + ret = platform_pci_choose_state(dev); + if (ret != PCI_POWER_ERROR) + return ret; + + switch (state.event) { + case PM_EVENT_ON: + return PCI_D0; + case PM_EVENT_FREEZE: + case PM_EVENT_PRETHAW: + /* REVISIT both freeze and pre-thaw "should" use D0 */ + case PM_EVENT_SUSPEND: + case PM_EVENT_HIBERNATE: + return PCI_D3hot; + default: + dev_info(&dev->dev, "unrecognized suspend event %d\n", + state.event); + BUG(); + } + return PCI_D0; +} + +EXPORT_SYMBOL(pci_choose_state); + +static int pci_save_pcie_state(struct pci_dev *dev) +{ + int pos, i = 0; + struct pci_cap_saved_state *save_state; + u16 *cap; + + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (pos <= 0) + return 0; + + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); + if (!save_state) { + dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__); + return -ENOMEM; + } + cap = (u16 *)&save_state->data[0]; + + pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]); + pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]); + + return 0; +} + +static void pci_restore_pcie_state(struct pci_dev *dev) +{ + int i = 0, pos; + struct pci_cap_saved_state *save_state; + u16 *cap; + + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (!save_state || pos <= 0) + return; + cap = (u16 *)&save_state->data[0]; + + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]); + pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]); +} + + +static int pci_save_pcix_state(struct pci_dev *dev) +{ + int pos; + struct pci_cap_saved_state *save_state; + + pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (pos <= 0) + return 0; + + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); + if (!save_state) { + dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__); + return -ENOMEM; + } + + pci_read_config_word(dev, pos + PCI_X_CMD, (u16 *)save_state->data); + + return 0; +} + +static void pci_restore_pcix_state(struct pci_dev *dev) +{ + int i = 0, pos; + struct pci_cap_saved_state *save_state; + u16 *cap; + + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); + pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!save_state || pos <= 0) + return; + cap = (u16 *)&save_state->data[0]; + + pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]); +} + + +/** + * pci_save_state - save the PCI configuration space of a device before suspending + * @dev: - PCI device that we're dealing with + */ +int +pci_save_state(struct pci_dev *dev) +{ + int i; + /* XXX: 100% dword access ok here? 
*/ + for (i = 0; i < 16; i++) + pci_read_config_dword(dev, i * 4,&dev->saved_config_space[i]); + dev->state_saved = true; + if ((i = pci_save_pcie_state(dev)) != 0) + return i; + if ((i = pci_save_pcix_state(dev)) != 0) + return i; + return 0; +} + +/** + * pci_restore_state - Restore the saved state of a PCI device + * @dev: - PCI device that we're dealing with + */ +int +pci_restore_state(struct pci_dev *dev) +{ + int i; + u32 val; + + /* PCI Express register must be restored first */ + pci_restore_pcie_state(dev); + + /* + * The Base Address register should be programmed before the command + * register(s) + */ + for (i = 15; i >= 0; i--) { + pci_read_config_dword(dev, i * 4, &val); + if (val != dev->saved_config_space[i]) { + dev_printk(KERN_DEBUG, &dev->dev, "restoring config " + "space at offset %#x (was %#x, writing %#x)\n", + i, val, (int)dev->saved_config_space[i]); + pci_write_config_dword(dev,i * 4, + dev->saved_config_space[i]); + } + } + pci_restore_pcix_state(dev); + pci_restore_msi_state(dev); + + return 0; +} + +static int do_pci_enable_device(struct pci_dev *dev, int bars) +{ + int err; + + err = pci_set_power_state(dev, PCI_D0); + if (err < 0 && err != -EIO) + return err; + err = pcibios_enable_device(dev, bars); + if (err < 0) + return err; + pci_fixup_device(pci_fixup_enable, dev); + + return 0; +} + +/** + * pci_reenable_device - Resume abandoned device + * @dev: PCI device to be resumed + * + * Note this function is a backend of pci_default_resume and is not supposed + * to be called by normal code, write proper resume handler and use it instead. + */ +int pci_reenable_device(struct pci_dev *dev) +{ + if (atomic_read(&dev->enable_cnt)) + return do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1); + return 0; +} + +static int __pci_enable_device_flags(struct pci_dev *dev, + resource_size_t flags) +{ + int err; + int i, bars = 0; + + if (atomic_add_return(1, &dev->enable_cnt) > 1) + return 0; /* already enabled */ + + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) + if (dev->resource[i].flags & flags) + bars |= (1 << i); + + err = do_pci_enable_device(dev, bars); + if (err < 0) + atomic_dec(&dev->enable_cnt); + return err; +} + +/** + * pci_enable_device_io - Initialize a device for use with IO space + * @dev: PCI device to be initialized + * + * Initialize device before it's used by a driver. Ask low-level code + * to enable I/O resources. Wake up the device if it was suspended. + * Beware, this function can fail. + */ +int pci_enable_device_io(struct pci_dev *dev) +{ + return __pci_enable_device_flags(dev, IORESOURCE_IO); +} + +/** + * pci_enable_device_mem - Initialize a device for use with Memory space + * @dev: PCI device to be initialized + * + * Initialize device before it's used by a driver. Ask low-level code + * to enable Memory resources. Wake up the device if it was suspended. + * Beware, this function can fail. + */ +int pci_enable_device_mem(struct pci_dev *dev) +{ + return __pci_enable_device_flags(dev, IORESOURCE_MEM); +} + +/** pci_enable_device() is implemented by the DDE. */ +#ifndef DDE_LINUX +/** + * pci_enable_device - Initialize device before it's used by a driver. + * @dev: PCI device to be initialized + * + * Initialize device before it's used by a driver. Ask low-level code + * to enable I/O and memory. Wake up the device if it was suspended. + * Beware, this function can fail. + * + * Note we don't actually enable the device many times if we call + * this function repeatedly (we just increment the count). 
+ */ +int pci_enable_device(struct pci_dev *dev) +{ + return __pci_enable_device_flags(dev, IORESOURCE_MEM | IORESOURCE_IO); +} +#endif + +/* + * Managed PCI resources. This manages device on/off, intx/msi/msix + * on/off and BAR regions. pci_dev itself records msi/msix status, so + * there's no need to track it separately. pci_devres is initialized + * when a device is enabled using managed PCI device enable interface. + */ +struct pci_devres { + unsigned int enabled:1; + unsigned int pinned:1; + unsigned int orig_intx:1; + unsigned int restore_intx:1; + u32 region_mask; +}; + +static void pcim_release(struct device *gendev, void *res) +{ + struct pci_dev *dev = container_of(gendev, struct pci_dev, dev); + struct pci_devres *this = res; + int i; + + if (dev->msi_enabled) + pci_disable_msi(dev); + if (dev->msix_enabled) + pci_disable_msix(dev); + + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) + if (this->region_mask & (1 << i)) + pci_release_region(dev, i); + + if (this->restore_intx) + pci_intx(dev, this->orig_intx); + + if (this->enabled && !this->pinned) + pci_disable_device(dev); +} + +static struct pci_devres * get_pci_dr(struct pci_dev *pdev) +{ + struct pci_devres *dr, *new_dr; + + dr = devres_find(&pdev->dev, pcim_release, NULL, NULL); + if (dr) + return dr; + + new_dr = devres_alloc(pcim_release, sizeof(*new_dr), GFP_KERNEL); + if (!new_dr) + return NULL; + return devres_get(&pdev->dev, new_dr, NULL, NULL); +} + +static struct pci_devres * find_pci_dr(struct pci_dev *pdev) +{ + if (pci_is_managed(pdev)) + return devres_find(&pdev->dev, pcim_release, NULL, NULL); + return NULL; +} + +/** + * pcim_enable_device - Managed pci_enable_device() + * @pdev: PCI device to be initialized + * + * Managed pci_enable_device(). + */ +int pcim_enable_device(struct pci_dev *pdev) +{ + struct pci_devres *dr; + int rc; + + dr = get_pci_dr(pdev); + if (unlikely(!dr)) + return -ENOMEM; + if (dr->enabled) + return 0; + + rc = pci_enable_device(pdev); + if (!rc) { + pdev->is_managed = 1; + dr->enabled = 1; + } + return rc; +} + +/** + * pcim_pin_device - Pin managed PCI device + * @pdev: PCI device to pin + * + * Pin managed PCI device @pdev. Pinned device won't be disabled on + * driver detach. @pdev must have been enabled with + * pcim_enable_device(). + */ +void pcim_pin_device(struct pci_dev *pdev) +{ + struct pci_devres *dr; + + dr = find_pci_dr(pdev); + WARN_ON(!dr || !dr->enabled); + if (dr) + dr->pinned = 1; +} + +#ifndef DDE_LINUX +/** + * pcibios_disable_device - disable arch specific PCI resources for device dev + * @dev: the PCI device to disable + * + * Disables architecture specific PCI resources for the device. This + * is the default implementation. Architecture implementations can + * override this. + */ +void __attribute__ ((weak)) pcibios_disable_device (struct pci_dev *dev) {} + +static void do_pci_disable_device(struct pci_dev *dev) +{ + u16 pci_command; + + pci_read_config_word(dev, PCI_COMMAND, &pci_command); + if (pci_command & PCI_COMMAND_MASTER) { + pci_command &= ~PCI_COMMAND_MASTER; + pci_write_config_word(dev, PCI_COMMAND, pci_command); + } + + pcibios_disable_device(dev); +} + +/** + * pci_disable_enabled_device - Disable device without updating enable_cnt + * @dev: PCI device to disable + * + * NOTE: This function is a backend of PCI power management routines and is + * not supposed to be called drivers. 
+ */ +void pci_disable_enabled_device(struct pci_dev *dev) +{ + if (atomic_read(&dev->enable_cnt)) + do_pci_disable_device(dev); +} + +/** + * pci_disable_device - Disable PCI device after use + * @dev: PCI device to be disabled + * + * Signal to the system that the PCI device is not in use by the system + * anymore. This only involves disabling PCI bus-mastering, if active. + * + * Note we don't actually disable the device until all callers of + * pci_device_enable() have called pci_device_disable(). + */ +void +pci_disable_device(struct pci_dev *dev) +{ + struct pci_devres *dr; + + dr = find_pci_dr(dev); + if (dr) + dr->enabled = 0; + + if (atomic_sub_return(1, &dev->enable_cnt) != 0) + return; + + do_pci_disable_device(dev); + + dev->is_busmaster = 0; +} + +/** + * pcibios_set_pcie_reset_state - set reset state for device dev + * @dev: the PCI-E device reset + * @state: Reset state to enter into + * + * + * Sets the PCI-E reset state for the device. This is the default + * implementation. Architecture implementations can override this. + */ +int __attribute__ ((weak)) pcibios_set_pcie_reset_state(struct pci_dev *dev, + enum pcie_reset_state state) +{ + return -EINVAL; +} +#endif + +/** + * pci_set_pcie_reset_state - set reset state for device dev + * @dev: the PCI-E device reset + * @state: Reset state to enter into + * + * + * Sets the PCI reset state for the device. + */ +int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) +{ + return pcibios_set_pcie_reset_state(dev, state); +} + +/** + * pci_pme_capable - check the capability of PCI device to generate PME# + * @dev: PCI device to handle. + * @state: PCI state from which device will issue PME#. + */ +bool pci_pme_capable(struct pci_dev *dev, pci_power_t state) +{ + if (!dev->pm_cap) + return false; + + return !!(dev->pme_support & (1 << state)); +} + +/** + * pci_pme_active - enable or disable PCI device's PME# function + * @dev: PCI device to handle. + * @enable: 'true' to enable PME# generation; 'false' to disable it. + * + * The caller must verify that the device is capable of generating PME# before + * calling this function with @enable equal to 'true'. + */ +void pci_pme_active(struct pci_dev *dev, bool enable) +{ + u16 pmcsr; + + if (!dev->pm_cap) + return; + + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); + /* Clear PME_Status by writing 1 to it and enable PME# */ + pmcsr |= PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE; + if (!enable) + pmcsr &= ~PCI_PM_CTRL_PME_ENABLE; + + pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); + + dev_printk(KERN_INFO, &dev->dev, "PME# %s\n", + enable ? "enabled" : "disabled"); +} + +/** + * pci_enable_wake - enable PCI device as wakeup event source + * @dev: PCI device affected + * @state: PCI state from which device will issue wakeup events + * @enable: True to enable event generation; false to disable + * + * This enables the device as a wakeup event source, or disables it. + * When such events involves platform-specific hooks, those hooks are + * called automatically by this routine. + * + * Devices with legacy power management (no standard PCI PM capabilities) + * always require such platform hooks. 
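+ *
+ * When the device can assert PME# from @state the native PCI PM mechanism
+ * is used; in addition, a platform hook (e.g. ACPI) is armed whenever the
+ * platform reports the device as wake-up capable.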
+ * + * RETURN VALUE: + * 0 is returned on success + * -EINVAL is returned if device is not supposed to wake up the system + * Error code depending on the platform is returned if both the platform and + * the native mechanism fail to enable the generation of wake-up events + */ +int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) +{ + int error = 0; + bool pme_done = false; + + if (enable && !device_may_wakeup(&dev->dev)) + return -EINVAL; + + /* + * According to "PCI System Architecture" 4th ed. by Tom Shanley & Don + * Anderson we should be doing PME# wake enable followed by ACPI wake + * enable. To disable wake-up we call the platform first, for symmetry. + */ + + if (!enable && platform_pci_can_wakeup(dev)) + error = platform_pci_sleep_wake(dev, false); + + if (!enable || pci_pme_capable(dev, state)) { + pci_pme_active(dev, enable); + pme_done = true; + } + + if (enable && platform_pci_can_wakeup(dev)) + error = platform_pci_sleep_wake(dev, true); + + return pme_done ? 0 : error; +} + +/** + * pci_wake_from_d3 - enable/disable device to wake up from D3_hot or D3_cold + * @dev: PCI device to prepare + * @enable: True to enable wake-up event generation; false to disable + * + * Many drivers want the device to wake up the system from D3_hot or D3_cold + * and this function allows them to set that up cleanly - pci_enable_wake() + * should not be called twice in a row to enable wake-up due to PCI PM vs ACPI + * ordering constraints. + * + * This function only returns error code if the device is not capable of + * generating PME# from both D3_hot and D3_cold, and the platform is unable to + * enable wake-up power for it. + */ +int pci_wake_from_d3(struct pci_dev *dev, bool enable) +{ + return pci_pme_capable(dev, PCI_D3cold) ? + pci_enable_wake(dev, PCI_D3cold, enable) : + pci_enable_wake(dev, PCI_D3hot, enable); +} + +/** + * pci_target_state - find an appropriate low power state for a given PCI dev + * @dev: PCI device + * + * Use underlying platform code to find a supported low power state for @dev. + * If the platform can't manage @dev, return the deepest state from which it + * can generate wake events, based on any available PME info. + */ +pci_power_t pci_target_state(struct pci_dev *dev) +{ + pci_power_t target_state = PCI_D3hot; + + if (platform_pci_power_manageable(dev)) { + /* + * Call the platform to choose the target state of the device + * and enable wake-up from this state if supported. + */ + pci_power_t state = platform_pci_choose_state(dev); + + switch (state) { + case PCI_POWER_ERROR: + case PCI_UNKNOWN: + break; + case PCI_D1: + case PCI_D2: + if (pci_no_d1d2(dev)) + break; + default: + target_state = state; + } + } else if (device_may_wakeup(&dev->dev)) { + /* + * Find the deepest state from which the device can generate + * wake-up events, make it the target state and enable device + * to generate PME#. + */ + if (!dev->pm_cap) + return PCI_POWER_ERROR; + + if (dev->pme_support) { + while (target_state + && !(dev->pme_support & (1 << target_state))) + target_state--; + } + } + + return target_state; +} + +/** + * pci_prepare_to_sleep - prepare PCI device for system-wide transition into a sleep state + * @dev: Device to handle. + * + * Choose the power state appropriate for the device depending on whether + * it can wake up the system and/or is power manageable by the platform + * (PCI_D3hot is the default) and put the device into that state. 
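+ *
+ * Illustrative suspend-path usage (hypothetical driver code, error handling
+ * omitted):
+ *
+ *	pci_save_state(pdev);
+ *	pci_prepare_to_sleep(pdev);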
+ */ +int pci_prepare_to_sleep(struct pci_dev *dev) +{ + pci_power_t target_state = pci_target_state(dev); + int error; + + if (target_state == PCI_POWER_ERROR) + return -EIO; + + pci_enable_wake(dev, target_state, true); + + error = pci_set_power_state(dev, target_state); + + if (error) + pci_enable_wake(dev, target_state, false); + + return error; +} + +/** + * pci_back_from_sleep - turn PCI device on during system-wide transition into working state + * @dev: Device to handle. + * + * Disable device's sytem wake-up capability and put it into D0. + */ +int pci_back_from_sleep(struct pci_dev *dev) +{ + pci_enable_wake(dev, PCI_D0, false); + return pci_set_power_state(dev, PCI_D0); +} + +/** + * pci_pm_init - Initialize PM functions of given PCI device + * @dev: PCI device to handle. + */ +void pci_pm_init(struct pci_dev *dev) +{ + int pm; + u16 pmc; + + dev->pm_cap = 0; + + /* find PCI PM capability in list */ + pm = pci_find_capability(dev, PCI_CAP_ID_PM); + if (!pm) + return; + /* Check device's ability to generate PME# */ + pci_read_config_word(dev, pm + PCI_PM_PMC, &pmc); + + if ((pmc & PCI_PM_CAP_VER_MASK) > 3) { + dev_err(&dev->dev, "unsupported PM cap regs version (%u)\n", + pmc & PCI_PM_CAP_VER_MASK); + return; + } + + dev->pm_cap = pm; + + dev->d1_support = false; + dev->d2_support = false; + if (!pci_no_d1d2(dev)) { + if (pmc & PCI_PM_CAP_D1) + dev->d1_support = true; + if (pmc & PCI_PM_CAP_D2) + dev->d2_support = true; + + if (dev->d1_support || dev->d2_support) + dev_printk(KERN_DEBUG, &dev->dev, "supports%s%s\n", + dev->d1_support ? " D1" : "", + dev->d2_support ? " D2" : ""); + } + + pmc &= PCI_PM_CAP_PME_MASK; + if (pmc) { + dev_info(&dev->dev, "PME# supported from%s%s%s%s%s\n", + (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", + (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", + (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", + (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", + (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); + dev->pme_support = pmc >> PCI_PM_CAP_PME_SHIFT; + /* + * Make device's PM flags reflect the wake-up capability, but + * let the user space enable it to wake up the system as needed. + */ + device_set_wakeup_capable(&dev->dev, true); + device_set_wakeup_enable(&dev->dev, false); + /* Disable the PME# generation functionality */ + pci_pme_active(dev, false); + } else { + dev->pme_support = 0; + } +} + +/** + * platform_pci_wakeup_init - init platform wakeup if present + * @dev: PCI device + * + * Some devices don't have PCI PM caps but can still generate wakeup + * events through platform methods (like ACPI events). If @dev supports + * platform wakeup events, set the device flag to indicate as much. This + * may be redundant if the device also supports PCI PM caps, but double + * initialization should be safe in that case. 
+ */ +void platform_pci_wakeup_init(struct pci_dev *dev) +{ + if (!platform_pci_can_wakeup(dev)) + return; + + device_set_wakeup_capable(&dev->dev, true); + device_set_wakeup_enable(&dev->dev, false); + platform_pci_sleep_wake(dev, false); +} + + +/** + * pci_add_save_buffer - allocate buffer for saving given capability registers + * @dev: the PCI device + * @cap: the capability to allocate the buffer for + * @size: requested size of the buffer + */ +static int pci_add_cap_save_buffer( + struct pci_dev *dev, char cap, unsigned int size) +{ + int pos; + struct pci_cap_saved_state *save_state; + + pos = pci_find_capability(dev, cap); + if (pos <= 0) + return 0; + + save_state = kzalloc(sizeof(*save_state) + size, GFP_KERNEL); + if (!save_state) + return -ENOMEM; + + save_state->cap_nr = cap; + pci_add_saved_cap(dev, save_state); + + return 0; +} + +/** + * pci_allocate_cap_save_buffers - allocate buffers for saving capabilities + * @dev: the PCI device + */ +void pci_allocate_cap_save_buffers(struct pci_dev *dev) +{ + int error; + + error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, 4 * sizeof(u16)); + if (error) + dev_err(&dev->dev, + "unable to preallocate PCI Express save buffer\n"); + + error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_PCIX, sizeof(u16)); + if (error) + dev_err(&dev->dev, + "unable to preallocate PCI-X save buffer\n"); +} + +/** + * pci_restore_standard_config - restore standard config registers of PCI device + * @dev: PCI device to handle + * + * This function assumes that the device's configuration space is accessible. + * If the device needs to be powered up, the function will wait for it to + * change the state. + */ +int pci_restore_standard_config(struct pci_dev *dev) +{ + pci_power_t prev_state; + int error; + + pci_update_current_state(dev, PCI_D0); + + prev_state = dev->current_state; + if (prev_state == PCI_D0) + goto Restore; + + error = pci_raw_set_power_state(dev, PCI_D0, false); + if (error) + return error; + + /* + * This assumes that we won't get a bus in B2 or B3 from the BIOS, but + * we've made this assumption forever and it appears to be universally + * satisfied. + */ + switch(prev_state) { + case PCI_D3cold: + case PCI_D3hot: + mdelay(pci_pm_d3_delay); + break; + case PCI_D2: + udelay(PCI_PM_D2_DELAY); + break; + } + + pci_update_current_state(dev, PCI_D0); + + Restore: + return dev->state_saved ? pci_restore_state(dev) : 0; +} + +/** + * pci_enable_ari - enable ARI forwarding if hardware support it + * @dev: the PCI device + */ +void pci_enable_ari(struct pci_dev *dev) +{ + int pos; + u32 cap; + u16 ctrl; + struct pci_dev *bridge; + + if (!dev->is_pcie || dev->devfn) + return; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI); + if (!pos) + return; + + bridge = dev->bus->self; + if (!bridge || !bridge->is_pcie) + return; + + pos = pci_find_capability(bridge, PCI_CAP_ID_EXP); + if (!pos) + return; + + pci_read_config_dword(bridge, pos + PCI_EXP_DEVCAP2, &cap); + if (!(cap & PCI_EXP_DEVCAP2_ARI)) + return; + + pci_read_config_word(bridge, pos + PCI_EXP_DEVCTL2, &ctrl); + ctrl |= PCI_EXP_DEVCTL2_ARI; + pci_write_config_word(bridge, pos + PCI_EXP_DEVCTL2, ctrl); + + bridge->ari_enabled = 1; +} + +/** + * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge + * @dev: the PCI device + * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTD, 4=INTD) + * + * Perform INTx swizzling for a device behind one level of bridge. This is + * required by section 9.1 of the PCI-to-PCI bridge specification for devices + * behind bridges on add-in cards. 
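+ *
+ * For example, a device in slot 2 using INTB (pin 2) appears at its
+ * upstream bridge as (((2 - 1) + 2) % 4) + 1 = 4, i.e. INTD.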
+ */ +u8 pci_swizzle_interrupt_pin(struct pci_dev *dev, u8 pin) +{ + return (((pin - 1) + PCI_SLOT(dev->devfn)) % 4) + 1; +} + +int +pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge) +{ + u8 pin; + + pin = dev->pin; + if (!pin) + return -1; + + while (dev->bus->self) { + pin = pci_swizzle_interrupt_pin(dev, pin); + dev = dev->bus->self; + } + *bridge = dev; + return pin; +} + +/** + * pci_common_swizzle - swizzle INTx all the way to root bridge + * @dev: the PCI device + * @pinp: pointer to the INTx pin value (1=INTA, 2=INTB, 3=INTD, 4=INTD) + * + * Perform INTx swizzling for a device. This traverses through all PCI-to-PCI + * bridges all the way up to a PCI root bus. + */ +u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp) +{ + u8 pin = *pinp; + + while (dev->bus->self) { + pin = pci_swizzle_interrupt_pin(dev, pin); + dev = dev->bus->self; + } + *pinp = pin; + return PCI_SLOT(dev->devfn); +} + +/** + * pci_release_region - Release a PCI bar + * @pdev: PCI device whose resources were previously reserved by pci_request_region + * @bar: BAR to release + * + * Releases the PCI I/O and memory resources previously reserved by a + * successful call to pci_request_region. Call this function only + * after all use of the PCI regions has ceased. + */ +void pci_release_region(struct pci_dev *pdev, int bar) +{ + struct pci_devres *dr; + + if (pci_resource_len(pdev, bar) == 0) + return; + if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) + release_region(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar)); + else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) + release_mem_region(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar)); + + dr = find_pci_dr(pdev); + if (dr) + dr->region_mask &= ~(1 << bar); +} + +/** + * __pci_request_region - Reserved PCI I/O and memory resource + * @pdev: PCI device whose resources are to be reserved + * @bar: BAR to be reserved + * @res_name: Name to be associated with resource. + * @exclusive: whether the region access is exclusive or not + * + * Mark the PCI region associated with PCI device @pdev BR @bar as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * If @exclusive is set, then the region is marked so that userspace + * is explicitly not allowed to map the resource via /dev/mem or + * sysfs MMIO access. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + */ +static int __pci_request_region(struct pci_dev *pdev, int bar, const char *res_name, + int exclusive) +{ + struct pci_devres *dr; + + if (pci_resource_len(pdev, bar) == 0) + return 0; + + if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) { + if (!request_region(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar), res_name)) + goto err_out; + } + else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { + if (!__request_mem_region(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar), res_name, + exclusive)) + goto err_out; + } + + dr = find_pci_dr(pdev); + if (dr) + dr->region_mask |= 1 << bar; + + return 0; + +err_out: + dev_warn(&pdev->dev, "BAR %d: can't reserve %s region %pR\n", + bar, + pci_resource_flags(pdev, bar) & IORESOURCE_IO ? 
"I/O" : "mem", + &pdev->resource[bar]); + return -EBUSY; +} + +/** + * pci_request_region - Reserve PCI I/O and memory resource + * @pdev: PCI device whose resources are to be reserved + * @bar: BAR to be reserved + * @res_name: Name to be associated with resource + * + * Mark the PCI region associated with PCI device @pdev BAR @bar as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + */ +int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name) +{ + return __pci_request_region(pdev, bar, res_name, 0); +} + +/** + * pci_request_region_exclusive - Reserved PCI I/O and memory resource + * @pdev: PCI device whose resources are to be reserved + * @bar: BAR to be reserved + * @res_name: Name to be associated with resource. + * + * Mark the PCI region associated with PCI device @pdev BR @bar as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + * + * The key difference that _exclusive makes it that userspace is + * explicitly not allowed to map the resource via /dev/mem or + * sysfs. + */ +int pci_request_region_exclusive(struct pci_dev *pdev, int bar, const char *res_name) +{ + return __pci_request_region(pdev, bar, res_name, IORESOURCE_EXCLUSIVE); +} +/** + * pci_release_selected_regions - Release selected PCI I/O and memory resources + * @pdev: PCI device whose resources were previously reserved + * @bars: Bitmask of BARs to be released + * + * Release selected PCI I/O and memory resources previously reserved. + * Call this function only after all use of the PCI regions has ceased. + */ +void pci_release_selected_regions(struct pci_dev *pdev, int bars) +{ + int i; + + for (i = 0; i < 6; i++) + if (bars & (1 << i)) + pci_release_region(pdev, i); +} + +int __pci_request_selected_regions(struct pci_dev *pdev, int bars, + const char *res_name, int excl) +{ + int i; + + for (i = 0; i < 6; i++) + if (bars & (1 << i)) + if (__pci_request_region(pdev, i, res_name, excl)) + goto err_out; + return 0; + +err_out: + while(--i >= 0) + if (bars & (1 << i)) + pci_release_region(pdev, i); + + return -EBUSY; +} + + +/** + * pci_request_selected_regions - Reserve selected PCI I/O and memory resources + * @pdev: PCI device whose resources are to be reserved + * @bars: Bitmask of BARs to be requested + * @res_name: Name to be associated with resource + */ +int pci_request_selected_regions(struct pci_dev *pdev, int bars, + const char *res_name) +{ + return __pci_request_selected_regions(pdev, bars, res_name, 0); +} + +int pci_request_selected_regions_exclusive(struct pci_dev *pdev, + int bars, const char *res_name) +{ + return __pci_request_selected_regions(pdev, bars, res_name, + IORESOURCE_EXCLUSIVE); +} + +/** + * pci_release_regions - Release reserved PCI I/O and memory resources + * @pdev: PCI device whose resources were previously reserved by pci_request_regions + * + * Releases all PCI I/O and memory resources previously reserved by a + * successful call to pci_request_regions. Call this function only + * after all use of the PCI regions has ceased. 
+ */ + +void pci_release_regions(struct pci_dev *pdev) +{ + pci_release_selected_regions(pdev, (1 << 6) - 1); +} + +/** + * pci_request_regions - Reserved PCI I/O and memory resources + * @pdev: PCI device whose resources are to be reserved + * @res_name: Name to be associated with resource. + * + * Mark all PCI regions associated with PCI device @pdev as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + */ +int pci_request_regions(struct pci_dev *pdev, const char *res_name) +{ + return pci_request_selected_regions(pdev, ((1 << 6) - 1), res_name); +} + +/** + * pci_request_regions_exclusive - Reserved PCI I/O and memory resources + * @pdev: PCI device whose resources are to be reserved + * @res_name: Name to be associated with resource. + * + * Mark all PCI regions associated with PCI device @pdev as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * pci_request_regions_exclusive() will mark the region so that + * /dev/mem and the sysfs MMIO access will not be allowed. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + */ +int pci_request_regions_exclusive(struct pci_dev *pdev, const char *res_name) +{ + return pci_request_selected_regions_exclusive(pdev, + ((1 << 6) - 1), res_name); +} + +static void __pci_set_master(struct pci_dev *dev, bool enable) +{ + u16 old_cmd, cmd; + + pci_read_config_word(dev, PCI_COMMAND, &old_cmd); + if (enable) + cmd = old_cmd | PCI_COMMAND_MASTER; + else + cmd = old_cmd & ~PCI_COMMAND_MASTER; + if (cmd != old_cmd) { + dev_dbg(&dev->dev, "%s bus mastering\n", + enable ? "enabling" : "disabling"); + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + dev->is_busmaster = enable; +} + +/** + * pci_set_master - enables bus-mastering for device dev + * @dev: the PCI device to enable + * + * Enables bus-mastering on the device and calls pcibios_set_master() + * to do the needed arch specific settings. + */ +void pci_set_master(struct pci_dev *dev) +{ + __pci_set_master(dev, true); + pcibios_set_master(dev); +} + +/** + * pci_clear_master - disables bus-mastering for device dev + * @dev: the PCI device to disable + */ +void pci_clear_master(struct pci_dev *dev) +{ + __pci_set_master(dev, false); +} + +#ifdef PCI_DISABLE_MWI +int pci_set_mwi(struct pci_dev *dev) +{ + return 0; +} + +int pci_try_set_mwi(struct pci_dev *dev) +{ + return 0; +} + +void pci_clear_mwi(struct pci_dev *dev) +{ +} + +#else + +#ifndef PCI_CACHE_LINE_BYTES +#define PCI_CACHE_LINE_BYTES L1_CACHE_BYTES +#endif + +/* This can be overridden by arch code. */ +/* Don't forget this is measured in 32-bit words, not bytes */ +u8 pci_cache_line_size = PCI_CACHE_LINE_BYTES / 4; + +/** + * pci_set_cacheline_size - ensure the CACHE_LINE_SIZE register is programmed + * @dev: the PCI device for which MWI is to be enabled + * + * Helper function for pci_set_mwi. + * Originally copied from drivers/net/acenic.c. + * Copyright 1998-2001 by Jes Sorensen, <jes@trained-monkey.org>. + * + * RETURNS: An appropriate -ERRNO error value on error, or zero for success. + */ +static int +pci_set_cacheline_size(struct pci_dev *dev) +{ + u8 cacheline_size; + + if (!pci_cache_line_size) + return -EINVAL; /* The system doesn't support MWI. 
*/ + + /* Validate current setting: the PCI_CACHE_LINE_SIZE must be + equal to or multiple of the right value. */ + pci_read_config_byte(dev, PCI_CACHE_LINE_SIZE, &cacheline_size); + if (cacheline_size >= pci_cache_line_size && + (cacheline_size % pci_cache_line_size) == 0) + return 0; + + /* Write the correct value. */ + pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, pci_cache_line_size); + /* Read it back. */ + pci_read_config_byte(dev, PCI_CACHE_LINE_SIZE, &cacheline_size); + if (cacheline_size == pci_cache_line_size) + return 0; + + dev_printk(KERN_DEBUG, &dev->dev, "cache line size of %d is not " + "supported\n", pci_cache_line_size << 2); + + return -EINVAL; +} + +/** + * pci_set_mwi - enables memory-write-invalidate PCI transaction + * @dev: the PCI device for which MWI is enabled + * + * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND. + * + * RETURNS: An appropriate -ERRNO error value on error, or zero for success. + */ +int +pci_set_mwi(struct pci_dev *dev) +{ + int rc; + u16 cmd; + + rc = pci_set_cacheline_size(dev); + if (rc) + return rc; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (! (cmd & PCI_COMMAND_INVALIDATE)) { + dev_dbg(&dev->dev, "enabling Mem-Wr-Inval\n"); + cmd |= PCI_COMMAND_INVALIDATE; + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + + return 0; +} + +/** + * pci_try_set_mwi - enables memory-write-invalidate PCI transaction + * @dev: the PCI device for which MWI is enabled + * + * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND. + * Callers are not required to check the return value. + * + * RETURNS: An appropriate -ERRNO error value on error, or zero for success. + */ +int pci_try_set_mwi(struct pci_dev *dev) +{ + int rc = pci_set_mwi(dev); + return rc; +} + +/** + * pci_clear_mwi - disables Memory-Write-Invalidate for device dev + * @dev: the PCI device to disable + * + * Disables PCI Memory-Write-Invalidate transaction on the device + */ +void +pci_clear_mwi(struct pci_dev *dev) +{ + u16 cmd; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (cmd & PCI_COMMAND_INVALIDATE) { + cmd &= ~PCI_COMMAND_INVALIDATE; + pci_write_config_word(dev, PCI_COMMAND, cmd); + } +} +#endif /* ! PCI_DISABLE_MWI */ + +/** + * pci_intx - enables/disables PCI INTx for device dev + * @pdev: the PCI device to operate on + * @enable: boolean: whether to enable or disable PCI INTx + * + * Enables/disables PCI INTx for device dev + */ +void +pci_intx(struct pci_dev *pdev, int enable) +{ + u16 pci_command, new; + + pci_read_config_word(pdev, PCI_COMMAND, &pci_command); + + if (enable) { + new = pci_command & ~PCI_COMMAND_INTX_DISABLE; + } else { + new = pci_command | PCI_COMMAND_INTX_DISABLE; + } + + if (new != pci_command) { + struct pci_devres *dr; + + pci_write_config_word(pdev, PCI_COMMAND, new); + + dr = find_pci_dr(pdev); + if (dr && !dr->restore_intx) { + dr->restore_intx = 1; + dr->orig_intx = !enable; + } + } +} + +/** + * pci_msi_off - disables any msi or msix capabilities + * @dev: the PCI device to operate on + * + * If you want to use msi see pci_enable_msi and friends. + * This is a lower level primitive that allows us to disable + * msi operation at the device level. 
+ */ +void pci_msi_off(struct pci_dev *dev) +{ + int pos; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (pos) { + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); + control &= ~PCI_MSI_FLAGS_ENABLE; + pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); + } + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (pos) { + pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); + control &= ~PCI_MSIX_FLAGS_ENABLE; + pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control); + } +} + +#ifndef HAVE_ARCH_PCI_SET_DMA_MASK +/* + * These can be overridden by arch-specific implementations + */ +int +pci_set_dma_mask(struct pci_dev *dev, u64 mask) +{ + if (!pci_dma_supported(dev, mask)) + return -EIO; + + dev->dma_mask = mask; + + return 0; +} + +int +pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask) +{ + if (!pci_dma_supported(dev, mask)) + return -EIO; + + dev->dev.coherent_dma_mask = mask; + + return 0; +} +#endif + +#ifndef HAVE_ARCH_PCI_SET_DMA_MAX_SEGMENT_SIZE +int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size) +{ + return dma_set_max_seg_size(&dev->dev, size); +} +EXPORT_SYMBOL(pci_set_dma_max_seg_size); +#endif + +#ifndef HAVE_ARCH_PCI_SET_DMA_SEGMENT_BOUNDARY +int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask) +{ + return dma_set_seg_boundary(&dev->dev, mask); +} +EXPORT_SYMBOL(pci_set_dma_seg_boundary); +#endif + +static int __pcie_flr(struct pci_dev *dev, int probe) +{ + u16 status; + u32 cap; + int exppos = pci_find_capability(dev, PCI_CAP_ID_EXP); + + if (!exppos) + return -ENOTTY; + pci_read_config_dword(dev, exppos + PCI_EXP_DEVCAP, &cap); + if (!(cap & PCI_EXP_DEVCAP_FLR)) + return -ENOTTY; + + if (probe) + return 0; + + pci_block_user_cfg_access(dev); + + /* Wait for Transaction Pending bit clean */ + msleep(100); + pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); + if (status & PCI_EXP_DEVSTA_TRPND) { + dev_info(&dev->dev, "Busy after 100ms while trying to reset; " + "sleeping for 1 second\n"); + ssleep(1); + pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); + if (status & PCI_EXP_DEVSTA_TRPND) + dev_info(&dev->dev, "Still busy after 1s; " + "proceeding with reset anyway\n"); + } + + pci_write_config_word(dev, exppos + PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_BCR_FLR); + mdelay(100); + + pci_unblock_user_cfg_access(dev); + return 0; +} + +static int __pci_af_flr(struct pci_dev *dev, int probe) +{ + int cappos = pci_find_capability(dev, PCI_CAP_ID_AF); + u8 status; + u8 cap; + + if (!cappos) + return -ENOTTY; + pci_read_config_byte(dev, cappos + PCI_AF_CAP, &cap); + if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) + return -ENOTTY; + + if (probe) + return 0; + + pci_block_user_cfg_access(dev); + + /* Wait for Transaction Pending bit clean */ + msleep(100); + pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); + if (status & PCI_AF_STATUS_TP) { + dev_info(&dev->dev, "Busy after 100ms while trying to" + " reset; sleeping for 1 second\n"); + ssleep(1); + pci_read_config_byte(dev, + cappos + PCI_AF_STATUS, &status); + if (status & PCI_AF_STATUS_TP) + dev_info(&dev->dev, "Still busy after 1s; " + "proceeding with reset anyway\n"); + } + pci_write_config_byte(dev, cappos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); + mdelay(100); + + pci_unblock_user_cfg_access(dev); + return 0; +} + +static int __pci_reset_function(struct pci_dev *pdev, int probe) +{ + int res; + + res = __pcie_flr(pdev, probe); + if (res != -ENOTTY) + return res; + + res = __pci_af_flr(pdev, probe); + if (res != 
-ENOTTY) + return res; + + return res; +} + +/** + * pci_execute_reset_function() - Reset a PCI device function + * @dev: Device function to reset + * + * Some devices allow an individual function to be reset without affecting + * other functions in the same device. The PCI device must be responsive + * to PCI config space in order to use this function. + * + * The device function is presumed to be unused when this function is called. + * Resetting the device will make the contents of PCI configuration space + * random, so any caller of this must be prepared to reinitialise the + * device including MSI, bus mastering, BARs, decoding IO and memory spaces, + * etc. + * + * Returns 0 if the device function was successfully reset or -ENOTTY if the + * device doesn't support resetting a single function. + */ +int pci_execute_reset_function(struct pci_dev *dev) +{ + return __pci_reset_function(dev, 0); +} +EXPORT_SYMBOL_GPL(pci_execute_reset_function); + +/** + * pci_reset_function() - quiesce and reset a PCI device function + * @dev: Device function to reset + * + * Some devices allow an individual function to be reset without affecting + * other functions in the same device. The PCI device must be responsive + * to PCI config space in order to use this function. + * + * This function does not just reset the PCI portion of a device, but + * clears all the state associated with the device. This function differs + * from pci_execute_reset_function in that it saves and restores device state + * over the reset. + * + * Returns 0 if the device function was successfully reset or -ENOTTY if the + * device doesn't support resetting a single function. + */ +int pci_reset_function(struct pci_dev *dev) +{ + int r = __pci_reset_function(dev, 1); + + if (r < 0) + return r; + + if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0) + disable_irq(dev->irq); + pci_save_state(dev); + + pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); + + r = pci_execute_reset_function(dev); + + pci_restore_state(dev); + if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0) + enable_irq(dev->irq); + + return r; +} +EXPORT_SYMBOL_GPL(pci_reset_function); + +/** + * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count + * @dev: PCI device to query + * + * Returns mmrbc: maximum designed memory read count in bytes + * or appropriate error value. + */ +int pcix_get_max_mmrbc(struct pci_dev *dev) +{ + int err, cap; + u32 stat; + + cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!cap) + return -EINVAL; + + err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat); + if (err) + return -EINVAL; + + return (stat & PCI_X_STATUS_MAX_READ) >> 12; +} +EXPORT_SYMBOL(pcix_get_max_mmrbc); + +/** + * pcix_get_mmrbc - get PCI-X maximum memory read byte count + * @dev: PCI device to query + * + * Returns mmrbc: maximum memory read count in bytes + * or appropriate error value. 
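+ *
+ * The hardware encodes the count as a two-bit field; the value returned is
+ * 512 << field, so field values 0..3 decode to 512, 1024, 2048 and 4096
+ * bytes.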
+ */ +int pcix_get_mmrbc(struct pci_dev *dev) +{ + int ret, cap; + u32 cmd; + + cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!cap) + return -EINVAL; + + ret = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd); + if (!ret) + ret = 512 << ((cmd & PCI_X_CMD_MAX_READ) >> 2); + + return ret; +} +EXPORT_SYMBOL(pcix_get_mmrbc); + +/** + * pcix_set_mmrbc - set PCI-X maximum memory read byte count + * @dev: PCI device to query + * @mmrbc: maximum memory read count in bytes + * valid values are 512, 1024, 2048, 4096 + * + * If possible sets maximum memory read byte count, some bridges have erratas + * that prevent this. + */ +int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc) +{ + int cap, err = -EINVAL; + u32 stat, cmd, v, o; + + if (mmrbc < 512 || mmrbc > 4096 || !is_power_of_2(mmrbc)) + goto out; + + v = ffs(mmrbc) - 10; + + cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!cap) + goto out; + + err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat); + if (err) + goto out; + + if (v > (stat & PCI_X_STATUS_MAX_READ) >> 21) + return -E2BIG; + + err = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd); + if (err) + goto out; + + o = (cmd & PCI_X_CMD_MAX_READ) >> 2; + if (o != v) { + if (v > o && dev->bus && + (dev->bus->bus_flags & PCI_BUS_FLAGS_NO_MMRBC)) + return -EIO; + + cmd &= ~PCI_X_CMD_MAX_READ; + cmd |= v << 2; + err = pci_write_config_dword(dev, cap + PCI_X_CMD, cmd); + } +out: + return err; +} +EXPORT_SYMBOL(pcix_set_mmrbc); + +/** + * pcie_get_readrq - get PCI Express read request size + * @dev: PCI device to query + * + * Returns maximum memory read request in bytes + * or appropriate error value. + */ +int pcie_get_readrq(struct pci_dev *dev) +{ + int ret, cap; + u16 ctl; + + cap = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (!cap) + return -EINVAL; + + ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); + if (!ret) + ret = 128 << ((ctl & PCI_EXP_DEVCTL_READRQ) >> 12); + + return ret; +} +EXPORT_SYMBOL(pcie_get_readrq); + +/** + * pcie_set_readrq - set PCI Express maximum memory read request + * @dev: PCI device to query + * @rq: maximum memory read count in bytes + * valid values are 128, 256, 512, 1024, 2048, 4096 + * + * If possible sets maximum read byte count + */ +int pcie_set_readrq(struct pci_dev *dev, int rq) +{ + int cap, err = -EINVAL; + u16 ctl, v; + + if (rq < 128 || rq > 4096 || !is_power_of_2(rq)) + goto out; + + v = (ffs(rq) - 8) << 12; + + cap = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (!cap) + goto out; + + err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); + if (err) + goto out; + + if ((ctl & PCI_EXP_DEVCTL_READRQ) != v) { + ctl &= ~PCI_EXP_DEVCTL_READRQ; + ctl |= v; + err = pci_write_config_dword(dev, cap + PCI_EXP_DEVCTL, ctl); + } + +out: + return err; +} +EXPORT_SYMBOL(pcie_set_readrq); + +/** + * pci_select_bars - Make BAR mask from the type of resource + * @dev: the PCI device for which BAR mask is made + * @flags: resource type mask to be selected + * + * This helper routine makes bar mask from the type of resource. + */ +int pci_select_bars(struct pci_dev *dev, unsigned long flags) +{ + int i, bars = 0; + for (i = 0; i < PCI_NUM_RESOURCES; i++) + if (pci_resource_flags(dev, i) & flags) + bars |= (1 << i); + return bars; +} + +/** + * pci_resource_bar - get position of the BAR associated with a resource + * @dev: the PCI device + * @resno: the resource number + * @type: the BAR type to be filled in + * + * Returns BAR position in config space, or 0 if the BAR is invalid. 
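+ *
+ * For a standard BAR the position is PCI_BASE_ADDRESS_0 + 4 * resno, so
+ * resource 2, for example, maps to config offset 0x18.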
+ */ +int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type) +{ + if (resno < PCI_ROM_RESOURCE) { + *type = pci_bar_unknown; + return PCI_BASE_ADDRESS_0 + 4 * resno; + } else if (resno == PCI_ROM_RESOURCE) { + *type = pci_bar_mem32; + return dev->rom_base_reg; + } + + dev_err(&dev->dev, "BAR: invalid resource #%d\n", resno); + return 0; +} + +static void __devinit pci_no_domains(void) +{ +#ifdef CONFIG_PCI_DOMAINS + pci_domains_supported = 0; +#endif +} + +/** + * pci_ext_cfg_enabled - can we access extended PCI config space? + * @dev: The PCI device of the root bridge. + * + * Returns 1 if we can access PCI extended config space (offsets + * greater than 0xff). This is the default implementation. Architecture + * implementations can override this. + */ +int __attribute__ ((weak)) pci_ext_cfg_avail(struct pci_dev *dev) +{ + return 1; +} + +#ifndef DDE_LINUX +static +#endif +int __devinit pci_init(void) +{ + struct pci_dev *dev = NULL; + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + pci_fixup_device(pci_fixup_final, dev); + } + + return 0; +} + +static int __init pci_setup(char *str) +{ +#ifndef DDE_LINUX + while (str) { + char *k = strchr(str, ','); + if (k) + *k++ = 0; + if (*str && (str = pcibios_setup(str)) && *str) { + if (!strcmp(str, "nomsi")) { + pci_no_msi(); + } else if (!strcmp(str, "noaer")) { + pci_no_aer(); + } else if (!strcmp(str, "nodomains")) { + pci_no_domains(); + } else if (!strncmp(str, "cbiosize=", 9)) { + pci_cardbus_io_size = memparse(str + 9, &str); + } else if (!strncmp(str, "cbmemsize=", 10)) { + pci_cardbus_mem_size = memparse(str + 10, &str); + } else { + printk(KERN_ERR "PCI: Unknown option `%s'\n", + str); + } + } + str = k; + } +#endif + return 0; +} +early_param("pci", pci_setup); + +device_initcall(pci_init); + +EXPORT_SYMBOL(pci_reenable_device); +EXPORT_SYMBOL(pci_enable_device_io); +EXPORT_SYMBOL(pci_enable_device_mem); +EXPORT_SYMBOL(pci_enable_device); +EXPORT_SYMBOL(pcim_enable_device); +EXPORT_SYMBOL(pcim_pin_device); +EXPORT_SYMBOL(pci_disable_device); +EXPORT_SYMBOL(pci_find_capability); +EXPORT_SYMBOL(pci_bus_find_capability); +EXPORT_SYMBOL(pci_release_regions); +EXPORT_SYMBOL(pci_request_regions); +EXPORT_SYMBOL(pci_request_regions_exclusive); +EXPORT_SYMBOL(pci_release_region); +EXPORT_SYMBOL(pci_request_region); +EXPORT_SYMBOL(pci_request_region_exclusive); +EXPORT_SYMBOL(pci_release_selected_regions); +EXPORT_SYMBOL(pci_request_selected_regions); +EXPORT_SYMBOL(pci_request_selected_regions_exclusive); +EXPORT_SYMBOL(pci_set_master); +EXPORT_SYMBOL(pci_clear_master); +EXPORT_SYMBOL(pci_set_mwi); +EXPORT_SYMBOL(pci_try_set_mwi); +EXPORT_SYMBOL(pci_clear_mwi); +EXPORT_SYMBOL_GPL(pci_intx); +EXPORT_SYMBOL(pci_set_dma_mask); +EXPORT_SYMBOL(pci_set_consistent_dma_mask); +EXPORT_SYMBOL(pci_assign_resource); +EXPORT_SYMBOL(pci_find_parent_resource); +EXPORT_SYMBOL(pci_select_bars); + +EXPORT_SYMBOL(pci_set_power_state); +EXPORT_SYMBOL(pci_save_state); +EXPORT_SYMBOL(pci_restore_state); +EXPORT_SYMBOL(pci_pme_capable); +EXPORT_SYMBOL(pci_pme_active); +EXPORT_SYMBOL(pci_enable_wake); +EXPORT_SYMBOL(pci_wake_from_d3); +EXPORT_SYMBOL(pci_target_state); +EXPORT_SYMBOL(pci_prepare_to_sleep); +EXPORT_SYMBOL(pci_back_from_sleep); +EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state); + diff --git a/libdde-linux26/lib/src/drivers/pci/probe.c b/libdde-linux26/lib/src/drivers/pci/probe.c new file mode 100644 index 00000000..32da5108 --- /dev/null +++ b/libdde-linux26/lib/src/drivers/pci/probe.c @@ -0,0 +1,1232 @@ +/* + 
* probe.c - PCI detection and setup code + */ + +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/pci-aspm.h> +#include "pci.h" + +#define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ +#define CARDBUS_RESERVE_BUSNR 3 + +/* Ugh. Need to stop exporting this to modules. */ +LIST_HEAD(pci_root_buses); +EXPORT_SYMBOL(pci_root_buses); + +#ifdef DDE_LINUX +#include "local.h" +#endif + +static int find_anything(struct device *dev, void *data) +{ + return 1; +} + +/* + * Some device drivers need know if pci is initiated. + * Basically, we think pci is not initiated when there + * is no device to be found on the pci_bus_type. + */ +int no_pci_devices(void) +{ + struct device *dev; + int no_devices; + + dev = bus_find_device(&pci_bus_type, NULL, NULL, find_anything); + no_devices = (dev == NULL); + put_device(dev); + return no_devices; +} +EXPORT_SYMBOL(no_pci_devices); + +/* + * PCI Bus Class Devices + */ +static ssize_t pci_bus_show_cpuaffinity(struct device *dev, + int type, + struct device_attribute *attr, + char *buf) +{ + int ret; + const struct cpumask *cpumask; + + cpumask = cpumask_of_pcibus(to_pci_bus(dev)); + ret = type? + cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask) : + cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask); + buf[ret++] = '\n'; + buf[ret] = '\0'; + return ret; +} + +static ssize_t inline pci_bus_show_cpumaskaffinity(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return pci_bus_show_cpuaffinity(dev, 0, attr, buf); +} + +static ssize_t inline pci_bus_show_cpulistaffinity(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return pci_bus_show_cpuaffinity(dev, 1, attr, buf); +} + +DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpumaskaffinity, NULL); +DEVICE_ATTR(cpulistaffinity, S_IRUGO, pci_bus_show_cpulistaffinity, NULL); + +/* + * PCI Bus Class + */ +static void release_pcibus_dev(struct device *dev) +{ + struct pci_bus *pci_bus = to_pci_bus(dev); + + if (pci_bus->bridge) + put_device(pci_bus->bridge); + kfree(pci_bus); +} + +static struct class pcibus_class = { + .name = "pci_bus", + .dev_release = &release_pcibus_dev, +}; + +static int __init pcibus_class_init(void) +{ + return class_register(&pcibus_class); +} +postcore_initcall(pcibus_class_init); + +/* + * Translate the low bits of the PCI base + * to the resource type + */ +static inline unsigned int pci_calc_resource_flags(unsigned int flags) +{ + if (flags & PCI_BASE_ADDRESS_SPACE_IO) + return IORESOURCE_IO; + + if (flags & PCI_BASE_ADDRESS_MEM_PREFETCH) + return IORESOURCE_MEM | IORESOURCE_PREFETCH; + + return IORESOURCE_MEM; +} + +static u64 pci_size(u64 base, u64 maxbase, u64 mask) +{ + u64 size = mask & maxbase; /* Find the significant bits */ + if (!size) + return 0; + + /* Get the lowest of them to find the decode size, and + from that the extent. */ + size = (size & ~(size-1)) - 1; + + /* base == maxbase can be valid only if the BAR has + already been programmed with all 1s. 
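+	   For example, a 32-bit memory BAR that reads back 0xfffff000 after
+	   the all-ones write has 0x1000 as its lowest set bit, so pci_size()
+	   returns 0xfff and the BAR decodes a 4 KB window.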
*/ + if (base == maxbase && ((base | size) & mask) != mask) + return 0; + + return size; +} + +static inline enum pci_bar_type decode_bar(struct resource *res, u32 bar) +{ + if ((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { + res->flags = bar & ~PCI_BASE_ADDRESS_IO_MASK; + return pci_bar_io; + } + + res->flags = bar & ~PCI_BASE_ADDRESS_MEM_MASK; + + if (res->flags & PCI_BASE_ADDRESS_MEM_TYPE_64) + return pci_bar_mem64; + return pci_bar_mem32; +} + +/** + * pci_read_base - read a PCI BAR + * @dev: the PCI device + * @type: type of the BAR + * @res: resource buffer to be filled in + * @pos: BAR position in the config space + * + * Returns 1 if the BAR is 64-bit, or 0 if 32-bit. + */ +int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, + struct resource *res, unsigned int pos) +{ + u32 l, sz, mask; + + mask = type ? ~PCI_ROM_ADDRESS_ENABLE : ~0; + + res->name = pci_name(dev); + + pci_read_config_dword(dev, pos, &l); + pci_write_config_dword(dev, pos, mask); + pci_read_config_dword(dev, pos, &sz); + pci_write_config_dword(dev, pos, l); + + /* + * All bits set in sz means the device isn't working properly. + * If the BAR isn't implemented, all bits must be 0. If it's a + * memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit + * 1 must be clear. + */ + if (!sz || sz == 0xffffffff) + goto fail; + + /* + * I don't know how l can have all bits set. Copied from old code. + * Maybe it fixes a bug on some ancient platform. + */ + if (l == 0xffffffff) + l = 0; + + if (type == pci_bar_unknown) { + type = decode_bar(res, l); + res->flags |= pci_calc_resource_flags(l) | IORESOURCE_SIZEALIGN; + if (type == pci_bar_io) { + l &= PCI_BASE_ADDRESS_IO_MASK; + mask = PCI_BASE_ADDRESS_IO_MASK & 0xffff; + } else { + l &= PCI_BASE_ADDRESS_MEM_MASK; + mask = (u32)PCI_BASE_ADDRESS_MEM_MASK; + } + } else { + res->flags |= (l & IORESOURCE_ROM_ENABLE); + l &= PCI_ROM_ADDRESS_MASK; + mask = (u32)PCI_ROM_ADDRESS_MASK; + } + + if (type == pci_bar_mem64) { + u64 l64 = l; + u64 sz64 = sz; + u64 mask64 = mask | (u64)~0 << 32; + + pci_read_config_dword(dev, pos + 4, &l); + pci_write_config_dword(dev, pos + 4, ~0); + pci_read_config_dword(dev, pos + 4, &sz); + pci_write_config_dword(dev, pos + 4, l); + + l64 |= ((u64)l << 32); + sz64 |= ((u64)sz << 32); + + sz64 = pci_size(l64, sz64, mask64); + + if (!sz64) + goto fail; + + if ((sizeof(resource_size_t) < 8) && (sz64 > 0x100000000ULL)) { + dev_err(&dev->dev, "can't handle 64-bit BAR\n"); + goto fail; + } else if ((sizeof(resource_size_t) < 8) && l) { + /* Address above 32-bit boundary; disable the BAR */ + pci_write_config_dword(dev, pos, 0); + pci_write_config_dword(dev, pos + 4, 0); + res->start = 0; + res->end = sz64; + } else { + res->start = l64; + res->end = l64 + sz64; + dev_printk(KERN_DEBUG, &dev->dev, + "reg %x 64bit mmio: %pR\n", pos, res); + } + } else { + sz = pci_size(l, sz, mask); + + if (!sz) + goto fail; + + res->start = l; + res->end = l + sz; + + dev_printk(KERN_DEBUG, &dev->dev, "reg %x %s: %pR\n", pos, + (res->flags & IORESOURCE_IO) ? "io port" : "32bit mmio", + res); + } + + out: + return (type == pci_bar_mem64) ? 
1 : 0; + fail: + res->flags = 0; + goto out; +} + +static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) +{ + unsigned int pos, reg; + + for (pos = 0; pos < howmany; pos++) { + struct resource *res = &dev->resource[pos]; + reg = PCI_BASE_ADDRESS_0 + (pos << 2); + pos += __pci_read_base(dev, pci_bar_unknown, res, reg); + } + + if (rom) { + struct resource *res = &dev->resource[PCI_ROM_RESOURCE]; + dev->rom_base_reg = rom; + res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH | + IORESOURCE_READONLY | IORESOURCE_CACHEABLE | + IORESOURCE_SIZEALIGN; + __pci_read_base(dev, pci_bar_mem32, res, rom); + } +} + +void __devinit pci_read_bridge_bases(struct pci_bus *child) +{ + struct pci_dev *dev = child->self; + u8 io_base_lo, io_limit_lo; + u16 mem_base_lo, mem_limit_lo; + unsigned long base, limit; + struct resource *res; + int i; + + if (!dev) /* It's a host bus, nothing to read */ + return; + + if (dev->transparent) { + dev_info(&dev->dev, "transparent bridge\n"); + for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++) + child->resource[i] = child->parent->resource[i - 3]; + } + + res = child->resource[0]; + pci_read_config_byte(dev, PCI_IO_BASE, &io_base_lo); + pci_read_config_byte(dev, PCI_IO_LIMIT, &io_limit_lo); + base = (io_base_lo & PCI_IO_RANGE_MASK) << 8; + limit = (io_limit_lo & PCI_IO_RANGE_MASK) << 8; + + if ((io_base_lo & PCI_IO_RANGE_TYPE_MASK) == PCI_IO_RANGE_TYPE_32) { + u16 io_base_hi, io_limit_hi; + pci_read_config_word(dev, PCI_IO_BASE_UPPER16, &io_base_hi); + pci_read_config_word(dev, PCI_IO_LIMIT_UPPER16, &io_limit_hi); + base |= (io_base_hi << 16); + limit |= (io_limit_hi << 16); + } + + if (base <= limit) { + res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; + if (!res->start) + res->start = base; + if (!res->end) + res->end = limit + 0xfff; + dev_printk(KERN_DEBUG, &dev->dev, "bridge io port: %pR\n", res); + } + + res = child->resource[1]; + pci_read_config_word(dev, PCI_MEMORY_BASE, &mem_base_lo); + pci_read_config_word(dev, PCI_MEMORY_LIMIT, &mem_limit_lo); + base = (mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16; + limit = (mem_limit_lo & PCI_MEMORY_RANGE_MASK) << 16; + if (base <= limit) { + res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM; + res->start = base; + res->end = limit + 0xfffff; + dev_printk(KERN_DEBUG, &dev->dev, "bridge 32bit mmio: %pR\n", + res); + } + + res = child->resource[2]; + pci_read_config_word(dev, PCI_PREF_MEMORY_BASE, &mem_base_lo); + pci_read_config_word(dev, PCI_PREF_MEMORY_LIMIT, &mem_limit_lo); + base = (mem_base_lo & PCI_PREF_RANGE_MASK) << 16; + limit = (mem_limit_lo & PCI_PREF_RANGE_MASK) << 16; + + if ((mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) { + u32 mem_base_hi, mem_limit_hi; + pci_read_config_dword(dev, PCI_PREF_BASE_UPPER32, &mem_base_hi); + pci_read_config_dword(dev, PCI_PREF_LIMIT_UPPER32, &mem_limit_hi); + + /* + * Some bridges set the base > limit by default, and some + * (broken) BIOSes do not initialize them. If we find + * this, just assume they are not being used. 
+ */ + if (mem_base_hi <= mem_limit_hi) { +#if BITS_PER_LONG == 64 + base |= ((long) mem_base_hi) << 32; + limit |= ((long) mem_limit_hi) << 32; +#else + if (mem_base_hi || mem_limit_hi) { + dev_err(&dev->dev, "can't handle 64-bit " + "address space for bridge\n"); + return; + } +#endif + } + } + if (base <= limit) { + res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH; + res->start = base; + res->end = limit + 0xfffff; + dev_printk(KERN_DEBUG, &dev->dev, "bridge %sbit mmio pref: %pR\n", + (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64" : "32", + res); + } +} + +static struct pci_bus * pci_alloc_bus(void) +{ + struct pci_bus *b; + + b = kzalloc(sizeof(*b), GFP_KERNEL); + if (b) { + INIT_LIST_HEAD(&b->node); + INIT_LIST_HEAD(&b->children); + INIT_LIST_HEAD(&b->devices); + INIT_LIST_HEAD(&b->slots); + } + return b; +} + +static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent, + struct pci_dev *bridge, int busnr) +{ + struct pci_bus *child; + int i; + + /* + * Allocate a new bus, and inherit stuff from the parent.. + */ + child = pci_alloc_bus(); + if (!child) + return NULL; + + child->parent = parent; + child->ops = parent->ops; + child->sysdata = parent->sysdata; + child->bus_flags = parent->bus_flags; + + /* initialize some portions of the bus device, but don't register it + * now as the parent is not properly set up yet. This device will get + * registered later in pci_bus_add_devices() + */ + child->dev.class = &pcibus_class; + dev_set_name(&child->dev, "%04x:%02x", pci_domain_nr(child), busnr); + + /* + * Set up the primary, secondary and subordinate + * bus numbers. + */ + child->number = child->secondary = busnr; + child->primary = parent->secondary; + child->subordinate = 0xff; + + if (!bridge) + return child; + + child->self = bridge; + child->bridge = get_device(&bridge->dev); + + /* Set up default resource pointers and names.. */ + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { + child->resource[i] = &bridge->resource[PCI_BRIDGE_RESOURCES+i]; + child->resource[i]->name = child->name; + } + bridge->subordinate = child; + + return child; +} + +struct pci_bus *__ref pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr) +{ + struct pci_bus *child; + + child = pci_alloc_child_bus(parent, dev, busnr); + if (child) { + down_write(&pci_bus_sem); + list_add_tail(&child->node, &parent->children); + up_write(&pci_bus_sem); + } + return child; +} + +static void pci_fixup_parent_subordinate_busnr(struct pci_bus *child, int max) +{ + struct pci_bus *parent = child->parent; + +#ifndef DDE_LINUX + /* Attempts to fix that up are really dangerous unless + we're going to re-assign all bus numbers. */ + if (!pcibios_assign_all_busses()) + return; +#endif + + while (parent->parent && parent->subordinate < max) { + parent->subordinate = max; + pci_write_config_byte(parent->self, PCI_SUBORDINATE_BUS, max); + parent = parent->parent; + } +} + +/* + * If it's a bridge, configure it and scan the bus behind it. + * For CardBus bridges, we don't scan behind as the devices will + * be handled by the bridge driver itself. + * + * We need to process bridges in two passes -- first we scan those + * already configured by the BIOS and after we are done with all of + * them, we proceed to assigning numbers to the remaining buses in + * order to avoid overlaps between old and new bus numbers. 
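+ *
+ * pci_scan_child_bus() therefore walks the bus twice, calling this
+ * function first with pass == 0 and then with pass == 1.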
+ */ +int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) +{ + struct pci_bus *child; + int is_cardbus = (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS); + u32 buses, i, j = 0; + u16 bctl; + int broken = 0; + + pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses); + + dev_dbg(&dev->dev, "scanning behind bridge, config %06x, pass %d\n", + buses & 0xffffff, pass); + + /* Check if setup is sensible at all */ + if (!pass && + ((buses & 0xff) != bus->number || ((buses >> 8) & 0xff) <= bus->number)) { + dev_dbg(&dev->dev, "bus configuration invalid, reconfiguring\n"); + broken = 1; + } + + /* Disable MasterAbortMode during probing to avoid reporting + of bus errors (in some architectures) */ + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl); + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, + bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT); + + if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus && !broken) { + unsigned int cmax, busnr; + /* + * Bus already configured by firmware, process it in the first + * pass and just note the configuration. + */ + if (pass) + goto out; + busnr = (buses >> 8) & 0xFF; + + /* + * If we already got to this bus through a different bridge, + * ignore it. This can happen with the i450NX chipset. + */ + if (pci_find_bus(pci_domain_nr(bus), busnr)) { + dev_info(&dev->dev, "bus %04x:%02x already known\n", + pci_domain_nr(bus), busnr); + goto out; + } + + child = pci_add_new_bus(bus, dev, busnr); + if (!child) + goto out; + child->primary = buses & 0xFF; + child->subordinate = (buses >> 16) & 0xFF; + child->bridge_ctl = bctl; + + cmax = pci_scan_child_bus(child); + if (cmax > max) + max = cmax; + if (child->subordinate > max) + max = child->subordinate; + } else { +#ifndef DDE_LINUX + /* + * We need to assign a number to this bus which we always + * do in the second pass. + */ + if (!pass) { + if (pcibios_assign_all_busses() || broken) + /* Temporarily disable forwarding of the + configuration cycles on all bridges in + this bus segment to avoid possible + conflicts in the second pass between two + bridges programmed with overlapping + bus ranges. */ + pci_write_config_dword(dev, PCI_PRIMARY_BUS, + buses & ~0xffffff); + goto out; + } +#endif /* DDE_LINUX */ + + /* Clear errors */ + pci_write_config_word(dev, PCI_STATUS, 0xffff); + + /* Prevent assigning a bus number that already exists. + * This can happen when a bridge is hot-plugged */ + if (pci_find_bus(pci_domain_nr(bus), max+1)) + goto out; + child = pci_add_new_bus(bus, dev, ++max); + buses = (buses & 0xff000000) + | ((unsigned int)(child->primary) << 0) + | ((unsigned int)(child->secondary) << 8) + | ((unsigned int)(child->subordinate) << 16); + + /* + * yenta.c forces a secondary latency timer of 176. + * Copy that behaviour here. + */ + if (is_cardbus) { + buses &= ~0xff000000; + buses |= CARDBUS_LATENCY_TIMER << 24; + } + + /* + * We need to blast all three values with a single write. + */ + pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses); + + if (!is_cardbus) { + child->bridge_ctl = bctl; + /* + * Adjust subordinate busnr in parent buses. + * We do this before scanning for children because + * some devices may not be detected if the bios + * was lazy. + */ + pci_fixup_parent_subordinate_busnr(child, max); + /* Now we can scan all subordinate buses... */ + max = pci_scan_child_bus(child); + /* + * now fix it up again since we have found + * the real value of max. 
+ */ + pci_fixup_parent_subordinate_busnr(child, max); + } else { + /* + * For CardBus bridges, we leave 4 bus numbers + * as cards with a PCI-to-PCI bridge can be + * inserted later. + */ + for (i=0; i<CARDBUS_RESERVE_BUSNR; i++) { + struct pci_bus *parent = bus; + if (pci_find_bus(pci_domain_nr(bus), + max+i+1)) + break; + while (parent->parent) { + if ((!pcibios_assign_all_busses()) && + (parent->subordinate > max) && + (parent->subordinate <= max+i)) { + j = 1; + } + parent = parent->parent; + } + if (j) { + /* + * Often, there are two cardbus bridges + * -- try to leave one valid bus number + * for each one. + */ + i /= 2; + break; + } + } + max += i; + pci_fixup_parent_subordinate_busnr(child, max); + } + /* + * Set the subordinate bus number to its real value. + */ + child->subordinate = max; + pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max); + } + + sprintf(child->name, + (is_cardbus ? "PCI CardBus %04x:%02x" : "PCI Bus %04x:%02x"), + pci_domain_nr(bus), child->number); + + /* Has only triggered on CardBus, fixup is in yenta_socket */ + while (bus->parent) { + if ((child->subordinate > bus->subordinate) || + (child->number > bus->subordinate) || + (child->number < bus->number) || + (child->subordinate < bus->number)) { + pr_debug("PCI: Bus #%02x (-#%02x) is %s " + "hidden behind%s bridge #%02x (-#%02x)\n", + child->number, child->subordinate, + (bus->number > child->subordinate && + bus->subordinate < child->number) ? + "wholly" : "partially", + bus->self->transparent ? " transparent" : "", + bus->number, bus->subordinate); + } + bus = bus->parent; + } + +out: + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); + + return max; +} + +/* + * Read interrupt line and base address registers. + * The architecture-dependent code can tweak these, of course. + */ +static void pci_read_irq(struct pci_dev *dev) +{ + unsigned char irq; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &irq); + dev->pin = irq; + if (irq) + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); + dev->irq = irq; +} + +#define LEGACY_IO_RESOURCE (IORESOURCE_IO | IORESOURCE_PCI_FIXED) + +/** + * pci_setup_device - fill in class and map information of a device + * @dev: the device structure to fill + * + * Initialize the device structure with information about the device's + * vendor,class,memory and IO-space addresses,IRQ lines etc. + * Called at initialisation of the PCI subsystem and by CardBus services. + * Returns 0 on success and -1 if unknown type of device (not normal, bridge + * or CardBus). 
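All of the masking and shifting on 'buses' in pci_scan_bridge() above manipulates the single config dword at PCI_PRIMARY_BUS. A sketch of its layout for readers following the bit arithmetic; the decode helper is hypothetical:

/* PCI_PRIMARY_BUS dword, as read and rewritten by pci_scan_bridge():
 *   bits  7:0   primary bus number
 *   bits 15:8   secondary bus number
 *   bits 23:16  subordinate bus number
 *   bits 31:24  secondary latency timer (CARDBUS_LATENCY_TIMER for CardBus)
 * Illustrative helper, not part of the patch. */
static void decode_primary_bus(u32 buses, u8 *pri, u8 *sec, u8 *sub)
{
	*pri = buses & 0xff;
	*sec = (buses >> 8) & 0xff;
	*sub = (buses >> 16) & 0xff;
}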
+ */ +static int pci_setup_device(struct pci_dev * dev) +{ + u32 class; + + dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus), + dev->bus->number, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + pci_read_config_dword(dev, PCI_CLASS_REVISION, &class); + dev->revision = class & 0xff; + class >>= 8; /* upper 3 bytes */ + dev->class = class; + class >>= 8; + + dev_dbg(&dev->dev, "found [%04x:%04x] class %06x header type %02x\n", + dev->vendor, dev->device, class, dev->hdr_type); + + /* "Unknown power state" */ + dev->current_state = PCI_UNKNOWN; + + /* Early fixups, before probing the BARs */ + pci_fixup_device(pci_fixup_early, dev); + class = dev->class >> 8; + + switch (dev->hdr_type) { /* header type */ + case PCI_HEADER_TYPE_NORMAL: /* standard header */ + if (class == PCI_CLASS_BRIDGE_PCI) + goto bad; + pci_read_irq(dev); + pci_read_bases(dev, 6, PCI_ROM_ADDRESS); + pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor); + pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device); + + /* + * Do the ugly legacy mode stuff here rather than broken chip + * quirk code. Legacy mode ATA controllers have fixed + * addresses. These are not always echoed in BAR0-3, and + * BAR0-3 in a few cases contain junk! + */ + if (class == PCI_CLASS_STORAGE_IDE) { + u8 progif; + pci_read_config_byte(dev, PCI_CLASS_PROG, &progif); + if ((progif & 1) == 0) { + dev->resource[0].start = 0x1F0; + dev->resource[0].end = 0x1F7; + dev->resource[0].flags = LEGACY_IO_RESOURCE; + dev->resource[1].start = 0x3F6; + dev->resource[1].end = 0x3F6; + dev->resource[1].flags = LEGACY_IO_RESOURCE; + } + if ((progif & 4) == 0) { + dev->resource[2].start = 0x170; + dev->resource[2].end = 0x177; + dev->resource[2].flags = LEGACY_IO_RESOURCE; + dev->resource[3].start = 0x376; + dev->resource[3].end = 0x376; + dev->resource[3].flags = LEGACY_IO_RESOURCE; + } + } + break; + + case PCI_HEADER_TYPE_BRIDGE: /* bridge header */ + if (class != PCI_CLASS_BRIDGE_PCI) + goto bad; + /* The PCI-to-PCI bridge spec requires that subtractive + decoding (i.e. transparent) bridge must have programming + interface code of 0x01. */ + pci_read_irq(dev); + dev->transparent = ((dev->class & 0xff) == 1); + pci_read_bases(dev, 2, PCI_ROM_ADDRESS1); + break; + + case PCI_HEADER_TYPE_CARDBUS: /* CardBus bridge header */ + if (class != PCI_CLASS_BRIDGE_CARDBUS) + goto bad; + pci_read_irq(dev); + pci_read_bases(dev, 1, 0); + pci_read_config_word(dev, PCI_CB_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor); + pci_read_config_word(dev, PCI_CB_SUBSYSTEM_ID, &dev->subsystem_device); + break; + + default: /* unknown header */ + dev_err(&dev->dev, "unknown header type %02x, " + "ignoring device\n", dev->hdr_type); + return -1; + + bad: + dev_err(&dev->dev, "ignoring class %02x (doesn't match header " + "type %02x)\n", class, dev->hdr_type); + dev->class = PCI_CLASS_NOT_DEFINED; + } + + /* We found a fine healthy device, go go go... */ + return 0; +} + +static void pci_release_capabilities(struct pci_dev *dev) +{ + pci_vpd_release(dev); +} + +/** + * pci_release_dev - free a pci device structure when all users of it are finished. + * @dev: device that's been disconnected + * + * Will be called only by the device core when all users of this pci device are + * done. 
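pci_setup_device() above stores the 24-bit class code in dev->class after stripping the revision byte; the switch then compares only the upper 16 bits. A sketch of how those bits break down, with a hypothetical helper name:

/* dev->class as filled in by pci_setup_device():
 *   bits  7:0   programming interface (0x01 on a bridge = subtractive decode)
 *   bits 15:8   sub-class
 *   bits 23:16  base class
 * Illustrative helper, not part of the patch. */
static void decode_class_code(u32 class, u8 *base, u8 *sub, u8 *prog_if)
{
	*prog_if = class & 0xff;
	*sub = (class >> 8) & 0xff;
	*base = (class >> 16) & 0xff;
}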
+ */
+static void pci_release_dev(struct device *dev)
+{
+	struct pci_dev *pci_dev;
+
+	pci_dev = to_pci_dev(dev);
+	pci_release_capabilities(pci_dev);
+	kfree(pci_dev);
+}
+
+static void set_pcie_port_type(struct pci_dev *pdev)
+{
+	int pos;
+	u16 reg16;
+
+	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (!pos)
+		return;
+	pdev->is_pcie = 1;
+	pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
+	pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
+}
+
+/**
+ * pci_cfg_space_size - get the configuration space size of the PCI device.
+ * @dev: PCI device
+ *
+ * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express devices
+ * have 4096 bytes. Even if the device is capable, that doesn't mean we can
+ * access it. Maybe we don't have a way to generate extended config space
+ * accesses, or the device is behind a reverse Express bridge. So we try
+ * reading the dword at 0x100 which must either be 0 or a valid extended
+ * capability header.
+ */
+int pci_cfg_space_size_ext(struct pci_dev *dev)
+{
+	u32 status;
+	int pos = PCI_CFG_SPACE_SIZE;
+
+	if (pci_read_config_dword(dev, pos, &status) != PCIBIOS_SUCCESSFUL)
+		goto fail;
+	if (status == 0xffffffff)
+		goto fail;
+
+	return PCI_CFG_SPACE_EXP_SIZE;
+
+ fail:
+	return PCI_CFG_SPACE_SIZE;
+}
+
+int pci_cfg_space_size(struct pci_dev *dev)
+{
+	int pos;
+	u32 status;
+
+	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	if (!pos) {
+		pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+		if (!pos)
+			goto fail;
+
+		pci_read_config_dword(dev, pos + PCI_X_STATUS, &status);
+		if (!(status & (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ)))
+			goto fail;
+	}
+
+	return pci_cfg_space_size_ext(dev);
+
+ fail:
+	return PCI_CFG_SPACE_SIZE;
+}
+
+static void pci_release_bus_bridge_dev(struct device *dev)
+{
+	kfree(dev);
+}
+
+struct pci_dev *alloc_pci_dev(void)
+{
+	struct pci_dev *dev;
+
+	dev = kzalloc(sizeof(struct pci_dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+
+	INIT_LIST_HEAD(&dev->bus_list);
+
+	return dev;
+}
+EXPORT_SYMBOL(alloc_pci_dev);
+
+/*
+ * Read the config data for a PCI device, sanity-check it
+ * and fill in the dev structure...
+ */
+static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
+{
+	struct pci_dev *dev;
+	struct pci_slot *slot;
+	u32 l;
+	u8 hdr_type;
+	int delay = 1;
+
+	if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l))
+		return NULL;
+
+	/* some broken boards return 0 or ~0 if a slot is empty: */
+	if (l == 0xffffffff || l == 0x00000000 ||
+	    l == 0x0000ffff || l == 0xffff0000)
+		return NULL;
+
+	/* Configuration request Retry Status */
+	while (l == 0xffff0001) {
+		msleep(delay);
+		delay *= 2;
+		if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l))
+			return NULL;
+		/* Card hasn't responded in 60 seconds? Must be stuck.
*/ + if (delay > 60 * 1000) { + printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not " + "responding\n", pci_domain_nr(bus), + bus->number, PCI_SLOT(devfn), + PCI_FUNC(devfn)); + return NULL; + } + } + + if (pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type)) + return NULL; + + dev = alloc_pci_dev(); + if (!dev) + return NULL; + + dev->bus = bus; + dev->sysdata = bus->sysdata; + dev->dev.parent = bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->devfn = devfn; + dev->hdr_type = hdr_type & 0x7f; + dev->multifunction = !!(hdr_type & 0x80); + dev->vendor = l & 0xffff; + dev->device = (l >> 16) & 0xffff; + dev->cfg_size = pci_cfg_space_size(dev); + dev->error_state = pci_channel_io_normal; + set_pcie_port_type(dev); + + list_for_each_entry(slot, &bus->slots, list) + if (PCI_SLOT(devfn) == slot->number) + dev->slot = slot; + + /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) + set this higher, assuming the system even supports it. */ + dev->dma_mask = 0xffffffff; + if (pci_setup_device(dev) < 0) { + kfree(dev); + return NULL; + } + + return dev; +} + +static void pci_init_capabilities(struct pci_dev *dev) +{ + /* MSI/MSI-X list */ + pci_msi_init_pci_dev(dev); + + /* Buffers for saving PCIe and PCI-X capabilities */ + pci_allocate_cap_save_buffers(dev); + + /* Power Management */ + pci_pm_init(dev); + platform_pci_wakeup_init(dev); + + /* Vital Product Data */ + pci_vpd_pci22_init(dev); + + /* Alternative Routing-ID Forwarding */ + pci_enable_ari(dev); +} + +void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) +{ + device_initialize(&dev->dev); + dev->dev.release = pci_release_dev; + pci_dev_get(dev); + + dev->dev.dma_mask = &dev->dma_mask; + dev->dev.dma_parms = &dev->dma_parms; + dev->dev.coherent_dma_mask = 0xffffffffull; + + pci_set_dma_max_seg_size(dev, 65536); + pci_set_dma_seg_boundary(dev, 0xffffffff); + + /* Fix up broken headers */ + pci_fixup_device(pci_fixup_header, dev); + + /* Initialize various capabilities */ + pci_init_capabilities(dev); + + /* + * Add the device to our list of discovered devices + * and the bus list for fixup functions, etc. + */ + down_write(&pci_bus_sem); + list_add_tail(&dev->bus_list, &bus->devices); + up_write(&pci_bus_sem); +} + +struct pci_dev *__ref pci_scan_single_device(struct pci_bus *bus, int devfn) +{ + struct pci_dev *dev; + + dev = pci_scan_device(bus, devfn); + if (!dev) + return NULL; + + pci_device_add(dev, bus); + + return dev; +} +EXPORT_SYMBOL(pci_scan_single_device); + +/** + * pci_scan_slot - scan a PCI slot on a bus for devices. + * @bus: PCI bus to scan + * @devfn: slot number to scan (must have zero function.) + * + * Scan a PCI slot on the specified PCI bus for devices, adding + * discovered devices to the @bus->devices list. New devices + * will not have is_added set. + */ +int pci_scan_slot(struct pci_bus *bus, int devfn) +{ + int func, nr = 0; + int scan_all_fns; + + scan_all_fns = pcibios_scan_all_fns(bus, devfn); + + for (func = 0; func < 8; func++, devfn++) { + struct pci_dev *dev; + + dev = pci_scan_single_device(bus, devfn); + if (dev) { + nr++; + + /* + * If this is a single function device, + * don't scan past the first function. 
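pci_device_add() and pci_scan_single_device() above are the building blocks for hot-adding a single function. A usage sketch under the assumption that pci_bus_add_devices() (defined elsewhere in drivers/pci, not in this diff) is available to finish registration; the helper name and slot number are illustrative:

/* Sketch only: probe function 0 of one slot and register whatever answered. */
static struct pci_dev *probe_one_function(struct pci_bus *bus, unsigned int slot)
{
	struct pci_dev *dev;

	dev = pci_scan_single_device(bus, PCI_DEVFN(slot, 0));
	if (!dev)
		return NULL;		/* nothing responded at this devfn */

	pci_bus_add_devices(bus);	/* create sysfs entries, bind drivers */
	return dev;
}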
+ */ + if (!dev->multifunction) { + if (func > 0) { + dev->multifunction = 1; + } else { + break; + } + } + } else { + if (func == 0 && !scan_all_fns) + break; + } + } + + /* only one slot has pcie device */ + if (bus->self && nr) + pcie_aspm_init_link_state(bus->self); + + return nr; +} + +unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) +{ + unsigned int devfn, pass, max = bus->secondary; + struct pci_dev *dev; + + pr_debug("PCI: Scanning bus %04x:%02x\n", pci_domain_nr(bus), bus->number); + + /* Go find them, Rover! */ + for (devfn = 0; devfn < 0x100; devfn += 8) + pci_scan_slot(bus, devfn); + +#ifndef DDE_LINUX + /* + * After performing arch-dependent fixup of the bus, look behind + * all PCI-to-PCI bridges on this bus. + */ + pr_debug("PCI: Fixups for bus %04x:%02x\n", pci_domain_nr(bus), bus->number); + pcibios_fixup_bus(bus); + for (pass=0; pass < 2; pass++) + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + max = pci_scan_bridge(bus, dev, max, pass); + } +#endif + + /* + * We've scanned the bus and so we know all about what's on + * the other side of any bridges that may be on this bus plus + * any devices. + * + * Return how far we've got finding sub-buses. + */ + pr_debug("PCI: Bus scan for %04x:%02x returning with max=%02x\n", + pci_domain_nr(bus), bus->number, max); + return max; +} + +void __attribute__((weak)) set_pci_bus_resources_arch_default(struct pci_bus *b) +{ +} + +struct pci_bus * pci_create_bus(struct device *parent, + int bus, struct pci_ops *ops, void *sysdata) +{ + int error; + struct pci_bus *b; + struct device *dev; + + b = pci_alloc_bus(); + if (!b) + return NULL; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev){ + kfree(b); + return NULL; + } + + b->sysdata = sysdata; + b->ops = ops; + + if (pci_find_bus(pci_domain_nr(b), bus)) { + /* If we already got to this bus through a different bridge, ignore it */ + pr_debug("PCI: Bus %04x:%02x already known\n", pci_domain_nr(b), bus); + goto err_out; + } + + down_write(&pci_bus_sem); + list_add_tail(&b->node, &pci_root_buses); + up_write(&pci_bus_sem); + + memset(dev, 0, sizeof(*dev)); + dev->parent = parent; + dev->release = pci_release_bus_bridge_dev; + dev_set_name(dev, "pci%04x:%02x", pci_domain_nr(b), bus); + error = device_register(dev); + if (error) + goto dev_reg_err; + b->bridge = get_device(dev); + + if (!parent) + set_dev_node(b->bridge, pcibus_to_node(b)); + + b->dev.class = &pcibus_class; + b->dev.parent = b->bridge; + dev_set_name(&b->dev, "%04x:%02x", pci_domain_nr(b), bus); + error = device_register(&b->dev); + if (error) + goto class_dev_reg_err; + error = device_create_file(&b->dev, &dev_attr_cpuaffinity); + if (error) + goto dev_create_file_err; + + /* Create legacy_io and legacy_mem files for this bus */ + pci_create_legacy_files(b); + + b->number = b->secondary = bus; + b->resource[0] = &ioport_resource; + b->resource[1] = &iomem_resource; + + set_pci_bus_resources_arch_default(b); + + return b; + +dev_create_file_err: + device_unregister(&b->dev); +class_dev_reg_err: + device_unregister(dev); +dev_reg_err: + down_write(&pci_bus_sem); + list_del(&b->node); + up_write(&pci_bus_sem); +err_out: + kfree(dev); + kfree(b); + return NULL; +} + +struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, + int bus, struct pci_ops *ops, void *sysdata) +{ + struct pci_bus *b; + + b = pci_create_bus(parent, bus, ops, sysdata); + if (b) + b->subordinate = pci_scan_child_bus(b); + 
return b; +} +EXPORT_SYMBOL(pci_scan_bus_parented); + +#ifdef CONFIG_HOTPLUG +EXPORT_SYMBOL(pci_add_new_bus); +EXPORT_SYMBOL(pci_scan_slot); +EXPORT_SYMBOL(pci_scan_bridge); +EXPORT_SYMBOL_GPL(pci_scan_child_bus); +#endif + +static int __init pci_sort_bf_cmp(const struct device *d_a, const struct device *d_b) +{ + const struct pci_dev *a = to_pci_dev(d_a); + const struct pci_dev *b = to_pci_dev(d_b); + + if (pci_domain_nr(a->bus) < pci_domain_nr(b->bus)) return -1; + else if (pci_domain_nr(a->bus) > pci_domain_nr(b->bus)) return 1; + + if (a->bus->number < b->bus->number) return -1; + else if (a->bus->number > b->bus->number) return 1; + + if (a->devfn < b->devfn) return -1; + else if (a->devfn > b->devfn) return 1; + + return 0; +} + +void __init pci_sort_breadthfirst(void) +{ + bus_sort_breadthfirst(&pci_bus_type, &pci_sort_bf_cmp); +} diff --git a/libdde-linux26/lib/src/fs/block_dev.c b/libdde-linux26/lib/src/fs/block_dev.c new file mode 100644 index 00000000..4c4c2f64 --- /dev/null +++ b/libdde-linux26/lib/src/fs/block_dev.c @@ -0,0 +1,1422 @@ +/* + * linux/fs/block_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + */ + +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/major.h> +#include <linux/smp_lock.h> +#include <linux/device_cgroup.h> +#include <linux/highmem.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/blkpg.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/mpage.h> +#include <linux/mount.h> +#include <linux/uio.h> +#include <linux/namei.h> +#include <linux/log2.h> +#include <asm/uaccess.h> +#include "internal.h" + +#ifdef DDE_LINUX +#include "local.h" +#endif + +struct bdev_inode { + struct block_device bdev; + struct inode vfs_inode; +}; + +static const struct address_space_operations def_blk_aops; + +static inline struct bdev_inode *BDEV_I(struct inode *inode) +{ + return container_of(inode, struct bdev_inode, vfs_inode); +} + +inline struct block_device *I_BDEV(struct inode *inode) +{ + return &BDEV_I(inode)->bdev; +} + +EXPORT_SYMBOL(I_BDEV); + +static sector_t max_block(struct block_device *bdev) +{ + sector_t retval = ~((sector_t)0); + loff_t sz = i_size_read(bdev->bd_inode); + + if (sz) { + unsigned int size = block_size(bdev); + unsigned int sizebits = blksize_bits(size); + retval = (sz >> sizebits); + } + return retval; +} + +/* Kill _all_ buffers and pagecache , dirty or not.. 
*/ +static void kill_bdev(struct block_device *bdev) +{ + if (bdev->bd_inode->i_mapping->nrpages == 0) + return; + invalidate_bh_lrus(); + truncate_inode_pages(bdev->bd_inode->i_mapping, 0); +} + +int set_blocksize(struct block_device *bdev, int size) +{ + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + return -EINVAL; + + /* Size cannot be smaller than the size supported by the device */ + if (size < bdev_hardsect_size(bdev)) + return -EINVAL; + + /* Don't change the size if it is same as current */ + if (bdev->bd_block_size != size) { + sync_blockdev(bdev); + bdev->bd_block_size = size; + bdev->bd_inode->i_blkbits = blksize_bits(size); + kill_bdev(bdev); + } + return 0; +} + +EXPORT_SYMBOL(set_blocksize); + +int sb_set_blocksize(struct super_block *sb, int size) +{ + if (set_blocksize(sb->s_bdev, size)) + return 0; + /* If we get here, we know size is power of two + * and it's value is between 512 and PAGE_SIZE */ + sb->s_blocksize = size; + sb->s_blocksize_bits = blksize_bits(size); + return sb->s_blocksize; +} + +EXPORT_SYMBOL(sb_set_blocksize); + +int sb_min_blocksize(struct super_block *sb, int size) +{ + int minsize = bdev_hardsect_size(sb->s_bdev); + if (size < minsize) + size = minsize; + return sb_set_blocksize(sb, size); +} + +EXPORT_SYMBOL(sb_min_blocksize); + +static int +blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + if (iblock >= max_block(I_BDEV(inode))) { + if (create) + return -EIO; + + /* + * for reads, we're just trying to fill a partial page. + * return a hole, they will have to call get_block again + * before they can fill it, and they will get -EIO at that + * time + */ + return 0; + } + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + +static int +blkdev_get_blocks(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + sector_t end_block = max_block(I_BDEV(inode)); + unsigned long max_blocks = bh->b_size >> inode->i_blkbits; + + if ((iblock + max_blocks) > end_block) { + max_blocks = end_block - iblock; + if ((long)max_blocks <= 0) { + if (create) + return -EIO; /* write fully beyond EOF */ + /* + * It is a read which is fully beyond EOF. 
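set_blocksize(), sb_set_blocksize() and sb_min_blocksize() above are what a filesystem calls early in its fill_super path. A minimal sketch; the function name and the 1 KiB floor are illustrative, not taken from the patch:

/* Illustrative only: request at least 1 KiB blocks, but never less than the
 * device's hard sector size; sb_min_blocksize() returns 0 on rejection. */
static int example_pick_blocksize(struct super_block *sb)
{
	int blocksize = sb_min_blocksize(sb, 1024);

	if (!blocksize)
		return -EINVAL;
	/* sb->s_blocksize and sb->s_blocksize_bits are now consistent */
	return 0;
}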
We return + * a !buffer_mapped buffer + */ + max_blocks = 0; + } + } + + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + bh->b_size = max_blocks << inode->i_blkbits; + if (max_blocks) + set_buffer_mapped(bh); + return 0; +} + +static ssize_t +blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + +#ifndef DDE_LINUX + return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), + iov, offset, nr_segs, blkdev_get_blocks, NULL); +#else + WARN_UNIMPL; + return 0; +#endif /* DDE_LINUX */ +} + +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, blkdev_get_block, wbc); +} + +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + unlock_page(page); + page_cache_release(page); + + return ret; +} + +/* + * private llseek: + * for a block special file file->f_path.dentry->d_inode->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t block_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *bd_inode = file->f_mapping->host; + loff_t size; + loff_t retval; + + mutex_lock(&bd_inode->i_mutex); + size = i_size_read(bd_inode); + + switch (origin) { + case 2: + offset += size; + break; + case 1: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= size) { + if (offset != file->f_pos) { + file->f_pos = offset; + } + retval = offset; + } + mutex_unlock(&bd_inode->i_mutex); + return retval; +} + +/* + * Filp is never NULL; the only case when ->fsync() is called with + * NULL first argument is nfsd_sync_dir() and that's not a directory. + */ + +static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + return sync_blockdev(I_BDEV(filp->f_mapping->host)); +} + +/* + * pseudo-fs + */ + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static struct kmem_cache * bdev_cachep __read_mostly; + +static struct inode *bdev_alloc_inode(struct super_block *sb) +{ + struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void bdev_destroy_inode(struct inode *inode) +{ + struct bdev_inode *bdi = BDEV_I(inode); + + bdi->bdev.bd_inode_backing_dev_info = NULL; + kmem_cache_free(bdev_cachep, bdi); +} + +static void init_once(void *foo) +{ + struct bdev_inode *ei = (struct bdev_inode *) foo; + struct block_device *bdev = &ei->bdev; + + memset(bdev, 0, sizeof(*bdev)); + mutex_init(&bdev->bd_mutex); + sema_init(&bdev->bd_mount_sem, 1); + INIT_LIST_HEAD(&bdev->bd_inodes); + INIT_LIST_HEAD(&bdev->bd_list); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_list); +#endif + inode_init_once(&ei->vfs_inode); + /* Initialize mutex for freeze. 
*/ + mutex_init(&bdev->bd_fsfreeze_mutex); +} + +static inline void __bd_forget(struct inode *inode) +{ + list_del_init(&inode->i_devices); + inode->i_bdev = NULL; + inode->i_mapping = &inode->i_data; +} + +static void bdev_clear_inode(struct inode *inode) +{ + struct block_device *bdev = &BDEV_I(inode)->bdev; + struct list_head *p; + spin_lock(&bdev_lock); + while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { + __bd_forget(list_entry(p, struct inode, i_devices)); + } + list_del_init(&bdev->bd_list); + spin_unlock(&bdev_lock); +} + +static const struct super_operations bdev_sops = { + .statfs = simple_statfs, + .alloc_inode = bdev_alloc_inode, + .destroy_inode = bdev_destroy_inode, + .drop_inode = generic_delete_inode, + .clear_inode = bdev_clear_inode, +}; + +static int bd_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); +} + +static struct file_system_type bd_type = { + .name = "bdev", + .get_sb = bd_get_sb, + .kill_sb = kill_anon_super, +}; + +struct super_block *blockdev_superblock __read_mostly; + +void __init bdev_cache_init(void) +{ + int err; + struct vfsmount *bd_mnt; + + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); + err = register_filesystem(&bd_type); + if (err) + panic("Cannot register bdev pseudo-fs"); + bd_mnt = kern_mount(&bd_type); + if (IS_ERR(bd_mnt)) + panic("Cannot create bdev pseudo-fs"); + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ +} + +/* + * Most likely _very_ bad one - but then it's hardly critical for small + * /dev and can be fixed when somebody will need really large one. + * Keep in mind that it will be fed through icache hash function too. 
+ */ +static inline unsigned long hash(dev_t dev) +{ + return MAJOR(dev)+MINOR(dev); +} + +static int bdev_test(struct inode *inode, void *data) +{ + return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; +} + +static int bdev_set(struct inode *inode, void *data) +{ + BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; + return 0; +} + +static LIST_HEAD(all_bdevs); + +struct block_device *bdget(dev_t dev) +{ + struct block_device *bdev; + struct inode *inode; + + printk_all_partitions(); + + inode = iget5_locked(blockdev_superblock, hash(dev), + bdev_test, bdev_set, &dev); + + if (!inode) + return NULL; + + bdev = &BDEV_I(inode)->bdev; + + if (inode->i_state & I_NEW) { + bdev->bd_contains = NULL; + bdev->bd_inode = inode; + bdev->bd_block_size = (1 << inode->i_blkbits); + bdev->bd_part_count = 0; + bdev->bd_invalidated = 0; + inode->i_mode = S_IFBLK; + inode->i_rdev = dev; + inode->i_bdev = bdev; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + inode->i_data.backing_dev_info = &default_backing_dev_info; + spin_lock(&bdev_lock); + list_add(&bdev->bd_list, &all_bdevs); + spin_unlock(&bdev_lock); + unlock_new_inode(inode); + } + return bdev; +} + +EXPORT_SYMBOL(bdget); + +long nr_blockdev_pages(void) +{ + struct block_device *bdev; + long ret = 0; + spin_lock(&bdev_lock); + list_for_each_entry(bdev, &all_bdevs, bd_list) { + ret += bdev->bd_inode->i_mapping->nrpages; + } + spin_unlock(&bdev_lock); + return ret; +} + +void bdput(struct block_device *bdev) +{ + iput(bdev->bd_inode); +} + +EXPORT_SYMBOL(bdput); + +static struct block_device *bd_acquire(struct inode *inode) +{ + struct block_device *bdev; + + spin_lock(&bdev_lock); + bdev = inode->i_bdev; + if (bdev) { + atomic_inc(&bdev->bd_inode->i_count); + spin_unlock(&bdev_lock); + return bdev; + } + spin_unlock(&bdev_lock); + + bdev = bdget(inode->i_rdev); + if (bdev) { + spin_lock(&bdev_lock); + if (!inode->i_bdev) { + /* + * We take an additional bd_inode->i_count for inode, + * and it's released in clear_inode() of inode. + * So, we can access it via ->i_mapping always + * without igrab(). 
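bdget() above either finds the existing bdev inode for a device number or sets up a fresh one; the matching reference drop is bdput(). A short pairing sketch, with the device number chosen arbitrarily for illustration:

/* Illustrative only: dev_t 8:0 is just an example (whole-disk sda). */
static void bdget_bdput_example(void)
{
	struct block_device *bdev = bdget(MKDEV(8, 0));

	if (!bdev)
		return;
	/* ... inspect bdev->bd_inode, bdev->bd_block_size, ... */
	bdput(bdev);	/* drop the inode reference taken by bdget() */
}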
+ */ + atomic_inc(&bdev->bd_inode->i_count); + inode->i_bdev = bdev; + inode->i_mapping = bdev->bd_inode->i_mapping; + list_add(&inode->i_devices, &bdev->bd_inodes); + } + spin_unlock(&bdev_lock); + } + return bdev; +} + +/* Call when you free inode */ + +void bd_forget(struct inode *inode) +{ + struct block_device *bdev = NULL; + + spin_lock(&bdev_lock); + if (inode->i_bdev) { + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); + } + spin_unlock(&bdev_lock); + + if (bdev) + iput(bdev->bd_inode); +} + +int bd_claim(struct block_device *bdev, void *holder) +{ + int res; + spin_lock(&bdev_lock); + + /* first decide result */ + if (bdev->bd_holder == holder) + res = 0; /* already a holder */ + else if (bdev->bd_holder != NULL) + res = -EBUSY; /* held by someone else */ + else if (bdev->bd_contains == bdev) + res = 0; /* is a whole device which isn't held */ + + else if (bdev->bd_contains->bd_holder == bd_claim) + res = 0; /* is a partition of a device that is being partitioned */ + else if (bdev->bd_contains->bd_holder != NULL) + res = -EBUSY; /* is a partition of a held device */ + else + res = 0; /* is a partition of an un-held device */ + + /* now impose change */ + if (res==0) { + /* note that for a whole device bd_holders + * will be incremented twice, and bd_holder will + * be set to bd_claim before being set to holder + */ + bdev->bd_contains->bd_holders ++; + bdev->bd_contains->bd_holder = bd_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; + } + spin_unlock(&bdev_lock); + return res; +} + +EXPORT_SYMBOL(bd_claim); + +void bd_release(struct block_device *bdev) +{ + spin_lock(&bdev_lock); + if (!--bdev->bd_contains->bd_holders) + bdev->bd_contains->bd_holder = NULL; + if (!--bdev->bd_holders) + bdev->bd_holder = NULL; + spin_unlock(&bdev_lock); +} + +EXPORT_SYMBOL(bd_release); + +#ifdef CONFIG_SYSFS +/* + * Functions for bd_claim_by_kobject / bd_release_from_kobject + * + * If a kobject is passed to bd_claim_by_kobject() + * and the kobject has a parent directory, + * following symlinks are created: + * o from the kobject to the claimed bdev + * o from "holders" directory of the bdev to the parent of the kobject + * bd_release_from_kobject() removes these symlinks. + * + * Example: + * If /dev/dm-0 maps to /dev/sda, kobject corresponding to + * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + */ + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + if (!from || !to) + return 0; + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + if (!from || !to) + return; + sysfs_remove_link(from, kobject_name(to)); +} + +/* + * 'struct bd_holder' contains pointers to kobjects symlinked by + * bd_claim_by_kobject. + * It's connected to bd_holder_list which is protected by bdev->bd_sem. + */ +struct bd_holder { + struct list_head list; /* chain of holders of the bdev */ + int count; /* references from the holder */ + struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ + struct kobject *hdev; /* e.g. "/block/dm-0" */ + struct kobject *hdir; /* e.g. "/block/sda/holders" */ + struct kobject *sdev; /* e.g. "/block/sda" */ +}; + +/* + * Get references of related kobjects at once. + * Returns 1 on success. 0 on failure. + * + * Should call bd_holder_release_dirs() after successful use. 
+ */ +static int bd_holder_grab_dirs(struct block_device *bdev, + struct bd_holder *bo) +{ + if (!bdev || !bo) + return 0; + + bo->sdir = kobject_get(bo->sdir); + if (!bo->sdir) + return 0; + + bo->hdev = kobject_get(bo->sdir->parent); + if (!bo->hdev) + goto fail_put_sdir; + + bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); + if (!bo->sdev) + goto fail_put_hdev; + + bo->hdir = kobject_get(bdev->bd_part->holder_dir); + if (!bo->hdir) + goto fail_put_sdev; + + return 1; + +fail_put_sdev: + kobject_put(bo->sdev); +fail_put_hdev: + kobject_put(bo->hdev); +fail_put_sdir: + kobject_put(bo->sdir); + + return 0; +} + +/* Put references of related kobjects at once. */ +static void bd_holder_release_dirs(struct bd_holder *bo) +{ + kobject_put(bo->hdir); + kobject_put(bo->sdev); + kobject_put(bo->hdev); + kobject_put(bo->sdir); +} + +static struct bd_holder *alloc_bd_holder(struct kobject *kobj) +{ + struct bd_holder *bo; + + bo = kzalloc(sizeof(*bo), GFP_KERNEL); + if (!bo) + return NULL; + + bo->count = 1; + bo->sdir = kobj; + + return bo; +} + +static void free_bd_holder(struct bd_holder *bo) +{ + kfree(bo); +} + +/** + * find_bd_holder - find matching struct bd_holder from the block device + * + * @bdev: struct block device to be searched + * @bo: target struct bd_holder + * + * Returns matching entry with @bo in @bdev->bd_holder_list. + * If found, increment the reference count and return the pointer. + * If not found, returns NULL. + */ +static struct bd_holder *find_bd_holder(struct block_device *bdev, + struct bd_holder *bo) +{ + struct bd_holder *tmp; + + list_for_each_entry(tmp, &bdev->bd_holder_list, list) + if (tmp->sdir == bo->sdir) { + tmp->count++; + return tmp; + } + + return NULL; +} + +/** + * add_bd_holder - create sysfs symlinks for bd_claim() relationship + * + * @bdev: block device to be bd_claimed + * @bo: preallocated and initialized by alloc_bd_holder() + * + * Add @bo to @bdev->bd_holder_list, create symlinks. + * + * Returns 0 if symlinks are created. + * Returns -ve if something fails. + */ +static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) +{ + int err; + + if (!bo) + return -EINVAL; + + if (!bd_holder_grab_dirs(bdev, bo)) + return -EBUSY; + + err = add_symlink(bo->sdir, bo->sdev); + if (err) + return err; + + err = add_symlink(bo->hdir, bo->hdev); + if (err) { + del_symlink(bo->sdir, bo->sdev); + return err; + } + + list_add_tail(&bo->list, &bdev->bd_holder_list); + return 0; +} + +/** + * del_bd_holder - delete sysfs symlinks for bd_claim() relationship + * + * @bdev: block device to be bd_claimed + * @kobj: holder's kobject + * + * If there is matching entry with @kobj in @bdev->bd_holder_list + * and no other bd_claim() from the same kobject, + * remove the struct bd_holder from the list, delete symlinks for it. + * + * Returns a pointer to the struct bd_holder when it's removed from the list + * and ready to be freed. + * Returns NULL if matching claim isn't found or there is other bd_claim() + * by the same kobject. 
+ */ +static struct bd_holder *del_bd_holder(struct block_device *bdev, + struct kobject *kobj) +{ + struct bd_holder *bo; + + list_for_each_entry(bo, &bdev->bd_holder_list, list) { + if (bo->sdir == kobj) { + bo->count--; + BUG_ON(bo->count < 0); + if (!bo->count) { + list_del(&bo->list); + del_symlink(bo->sdir, bo->sdev); + del_symlink(bo->hdir, bo->hdev); + bd_holder_release_dirs(bo); + return bo; + } + break; + } + } + + return NULL; +} + +/** + * bd_claim_by_kobject - bd_claim() with additional kobject signature + * + * @bdev: block device to be claimed + * @holder: holder's signature + * @kobj: holder's kobject + * + * Do bd_claim() and if it succeeds, create sysfs symlinks between + * the bdev and the holder's kobject. + * Use bd_release_from_kobject() when relesing the claimed bdev. + * + * Returns 0 on success. (same as bd_claim()) + * Returns errno on failure. + */ +static int bd_claim_by_kobject(struct block_device *bdev, void *holder, + struct kobject *kobj) +{ + int err; + struct bd_holder *bo, *found; + + if (!kobj) + return -EINVAL; + + bo = alloc_bd_holder(kobj); + if (!bo) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + + err = bd_claim(bdev, holder); + if (err) + goto fail; + + found = find_bd_holder(bdev, bo); + if (found) + goto fail; + + err = add_bd_holder(bdev, bo); + if (err) + bd_release(bdev); + else + bo = NULL; +fail: + mutex_unlock(&bdev->bd_mutex); + free_bd_holder(bo); + return err; +} + +/** + * bd_release_from_kobject - bd_release() with additional kobject signature + * + * @bdev: block device to be released + * @kobj: holder's kobject + * + * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). + */ +static void bd_release_from_kobject(struct block_device *bdev, + struct kobject *kobj) +{ + if (!kobj) + return; + + mutex_lock(&bdev->bd_mutex); + bd_release(bdev); + free_bd_holder(del_bd_holder(bdev, kobj)); + mutex_unlock(&bdev->bd_mutex); +} + +/** + * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() + * + * @bdev: block device to be claimed + * @holder: holder's signature + * @disk: holder's gendisk + * + * Call bd_claim_by_kobject() with getting @disk->slave_dir. + */ +int bd_claim_by_disk(struct block_device *bdev, void *holder, + struct gendisk *disk) +{ + return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); +} +EXPORT_SYMBOL_GPL(bd_claim_by_disk); + +/** + * bd_release_from_disk - wrapper function for bd_release_from_kobject() + * + * @bdev: block device to be claimed + * @disk: holder's gendisk + * + * Call bd_release_from_kobject() and put @disk->slave_dir. + */ +void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) +{ + bd_release_from_kobject(bdev, disk->slave_dir); + kobject_put(disk->slave_dir); +} +EXPORT_SYMBOL_GPL(bd_release_from_disk); +#endif + +/* + * Tries to open block device by device number. Use it ONLY if you + * really do not have anything better - i.e. when you are behind a + * truly sucky interface and all you are given is a device number. _Never_ + * to be used for internal purposes. If you ever need it - reconsider + * your API. + */ +struct block_device *open_by_devnum(dev_t dev, fmode_t mode) +{ + struct block_device *bdev = bdget(dev); + int err = -ENOMEM; + if (bdev) + err = blkdev_get(bdev, mode); + return err ? ERR_PTR(err) : bdev; +} + +EXPORT_SYMBOL(open_by_devnum); + +/** + * flush_disk - invalidates all buffer-cache entries on a disk + * + * @bdev: struct block device to be flushed + * + * Invalidates all buffer-cache entries on a disk. 
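open_by_devnum() above combines bdget() with blkdev_get(); the matching release is blkdev_put(), defined further down in this file. A sketch of the pairing, again with an arbitrary device number and helper name:

/* Sketch only: open a whole device read-only by number and release it. */
static int open_by_devnum_example(void)
{
	struct block_device *bdev = open_by_devnum(MKDEV(8, 0), FMODE_READ);

	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... read-only access to the device ... */
	blkdev_put(bdev, FMODE_READ);	/* mode must match the one used to open */
	return 0;
}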
It should be called + * when a disk has been changed -- either by a media change or online + * resize. + */ +static void flush_disk(struct block_device *bdev) +{ + if (__invalidate_device(bdev)) { + char name[BDEVNAME_SIZE] = ""; + + if (bdev->bd_disk) + disk_name(bdev->bd_disk, 0, name); + printk(KERN_WARNING "VFS: busy inodes on changed media or " + "resized disk %s\n", name); + } + + if (!bdev->bd_disk) + return; + if (disk_partitionable(bdev->bd_disk)) + bdev->bd_invalidated = 1; +} + +/** + * check_disk_size_change - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @bdev: struct bdev to adjust. + * + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. + */ +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +{ + loff_t disk_size, bdev_size; + + disk_size = (loff_t)get_capacity(disk) << 9; + bdev_size = i_size_read(bdev->bd_inode); + if (disk_size != bdev_size) { + char name[BDEVNAME_SIZE]; + + disk_name(disk, 0, name); + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + name, bdev_size, disk_size); + i_size_write(bdev->bd_inode, disk_size); + flush_disk(bdev); + } +} +EXPORT_SYMBOL(check_disk_size_change); + +/** + * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back + * @disk: struct gendisk to be revalidated + * + * This routine is a wrapper for lower-level driver's revalidate_disk + * call-backs. It is used to do common pre and post operations needed + * for all revalidate_disk operations. + */ +int revalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev; + int ret = 0; + + if (disk->fops->revalidate_disk) + ret = disk->fops->revalidate_disk(disk); + + bdev = bdget_disk(disk, 0); + if (!bdev) + return ret; + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} +EXPORT_SYMBOL(revalidate_disk); + +/* + * This routine checks whether a removable media has been changed, + * and invalidates all buffer-cache-entries in that case. This + * is a relatively slow routine, so we have to try to minimize using + * it. Thus it is called only upon a 'mount' or 'open'. This + * is the best way of combining speed and utility, I think. 
+ * People changing diskettes in the middle of an operation deserve + * to lose :-) + */ +int check_disk_change(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + struct block_device_operations * bdops = disk->fops; + + if (!bdops->media_changed) + return 0; + if (!bdops->media_changed(bdev->bd_disk)) + return 0; + + flush_disk(bdev); + if (bdops->revalidate_disk) + bdops->revalidate_disk(bdev->bd_disk); + return 1; +} + +EXPORT_SYMBOL(check_disk_change); + +void bd_set_size(struct block_device *bdev, loff_t size) +{ + unsigned bsize = bdev_hardsect_size(bdev); + + bdev->bd_inode->i_size = size; + while (bsize < PAGE_CACHE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} +EXPORT_SYMBOL(bd_set_size); + +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); + +/* + * bd_mutex locking: + * + * mutex_lock(part->bd_mutex) + * mutex_lock_nested(whole->bd_mutex, 1) + */ + +static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) +{ + struct gendisk *disk; + int ret; + int partno; + int perm = 0; + + if (mode & FMODE_READ) + perm |= MAY_READ; + if (mode & FMODE_WRITE) + perm |= MAY_WRITE; + /* + * hooks: /n/, see "layering violations". + */ + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); + return ret; + } + + lock_kernel(); + restart: + + ret = -ENXIO; + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + goto out_unlock_kernel; + + mutex_lock_nested(&bdev->bd_mutex, for_part); + if (!bdev->bd_openers) { + bdev->bd_disk = disk; + bdev->bd_contains = bdev; + if (!partno) { + struct backing_dev_info *bdi; + + ret = -ENXIO; + bdev->bd_part = disk_get_part(disk, partno); + if (!bdev->bd_part) + goto out_clear; + + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret == -ERESTARTSYS) { + /* Lost a race with 'disk' being + * deleted, try again. 
+ * See md.c + */ + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + module_put(disk->fops->owner); + put_disk(disk); + bdev->bd_disk = NULL; + mutex_unlock(&bdev->bd_mutex); + goto restart; + } + if (ret) + goto out_clear; + } + if (!bdev->bd_openers) { + bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) + bdi = &default_backing_dev_info; + bdev->bd_inode->i_data.backing_dev_info = bdi; + } + if (bdev->bd_invalidated) + rescan_partitions(disk, bdev); + } else { + struct block_device *whole; + whole = bdget_disk(disk, 0); + ret = -ENOMEM; + if (!whole) + goto out_clear; + BUG_ON(for_part); + ret = __blkdev_get(whole, mode, 1); + if (ret) + goto out_clear; + bdev->bd_contains = whole; + bdev->bd_inode->i_data.backing_dev_info = + whole->bd_inode->i_data.backing_dev_info; + bdev->bd_part = disk_get_part(disk, partno); + if (!(disk->flags & GENHD_FL_UP) || + !bdev->bd_part || !bdev->bd_part->nr_sects) { + ret = -ENXIO; + goto out_clear; + } + bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + } + } else { + put_disk(disk); + module_put(disk->fops->owner); + disk = NULL; + if (bdev->bd_contains == bdev) { + if (bdev->bd_disk->fops->open) { + ret = bdev->bd_disk->fops->open(bdev, mode); + if (ret) + goto out_unlock_bdev; + } + if (bdev->bd_invalidated) + rescan_partitions(bdev->bd_disk, bdev); + } + } + bdev->bd_openers++; + if (for_part) + bdev->bd_part_count++; + mutex_unlock(&bdev->bd_mutex); + unlock_kernel(); + return 0; + + out_clear: + disk_put_part(bdev->bd_part); + bdev->bd_disk = NULL; + bdev->bd_part = NULL; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, mode, 1); + bdev->bd_contains = NULL; + out_unlock_bdev: + mutex_unlock(&bdev->bd_mutex); + out_unlock_kernel: + unlock_kernel(); + + if (disk) + module_put(disk->fops->owner); + put_disk(disk); + bdput(bdev); + + return ret; +} + +int blkdev_get(struct block_device *bdev, fmode_t mode) +{ + return __blkdev_get(bdev, mode, 0); +} +EXPORT_SYMBOL(blkdev_get); + +static int blkdev_open(struct inode * inode, struct file * filp) +{ + struct block_device *bdev; + int res; + + /* + * Preserve backwards compatibility and allow large file access + * even if userspace doesn't ask for it explicitly. Some mkfs + * binary needs it. We might want to drop this workaround + * during an unstable branch. 
+ */ + filp->f_flags |= O_LARGEFILE; + + if (filp->f_flags & O_NDELAY) + filp->f_mode |= FMODE_NDELAY; + if (filp->f_flags & O_EXCL) + filp->f_mode |= FMODE_EXCL; + if ((filp->f_flags & O_ACCMODE) == 3) + filp->f_mode |= FMODE_WRITE_IOCTL; + + bdev = bd_acquire(inode); + if (bdev == NULL) + return -ENOMEM; + + filp->f_mapping = bdev->bd_inode->i_mapping; + + res = blkdev_get(bdev, filp->f_mode); + if (res) + return res; + + if (filp->f_mode & FMODE_EXCL) { + res = bd_claim(bdev, filp); + if (res) + goto out_blkdev_put; + } + + return 0; + + out_blkdev_put: + blkdev_put(bdev, filp->f_mode); + return res; +} + +static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +{ + int ret = 0; + struct gendisk *disk = bdev->bd_disk; + struct block_device *victim = NULL; + + mutex_lock_nested(&bdev->bd_mutex, for_part); + lock_kernel(); + if (for_part) + bdev->bd_part_count--; + + if (!--bdev->bd_openers) { + sync_blockdev(bdev); + kill_bdev(bdev); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + ret = disk->fops->release(disk, mode); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + bdev->bd_disk = NULL; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; + } + unlock_kernel(); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + if (victim) + __blkdev_put(victim, mode, 1); + return ret; +} + +int blkdev_put(struct block_device *bdev, fmode_t mode) +{ + return __blkdev_put(bdev, mode, 0); +} +EXPORT_SYMBOL(blkdev_put); + +static int blkdev_close(struct inode * inode, struct file * filp) +{ + struct block_device *bdev = I_BDEV(filp->f_mapping->host); + if (bdev->bd_holder == filp) + bd_release(bdev); + return blkdev_put(bdev, filp->f_mode); +} + +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct block_device *bdev = I_BDEV(file->f_mapping->host); + fmode_t mode = file->f_mode; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + return blkdev_ioctl(bdev, mode, cmd, arg); +} + +/* + * Try to release a page associated with block device when the system + * is under memory pressure. 
+ */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + +static const struct address_space_operations def_blk_aops = { + .readpage = blkdev_readpage, + .writepage = blkdev_writepage, + .sync_page = block_sync_page, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .writepages = generic_writepages, + .releasepage = blkdev_releasepage, + .direct_IO = blkdev_direct_IO, +}; + +const struct file_operations def_blk_fops = { + .open = blkdev_open, + .release = blkdev_close, +#ifndef DDE_LINUX + .llseek = block_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write_nolock, + .mmap = generic_file_mmap, + .fsync = block_fsync, + .unlocked_ioctl = block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_blkdev_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +#endif /* DDE_LINUX */ +}; + +int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) +{ + int res; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + res = blkdev_ioctl(bdev, 0, cmd, arg); + set_fs(old_fs); + return res; +} + +EXPORT_SYMBOL(ioctl_by_bdev); + +/** + * lookup_bdev - lookup a struct block_device by name + * @pathname: special file representing the block device + * + * Get a reference to the blockdevice at @pathname in the current + * namespace if possible and return it. Return ERR_PTR(error) + * otherwise. + */ +struct block_device *lookup_bdev(const char *pathname) +{ + struct block_device *bdev; + struct inode *inode; + struct path path; + int error; + + if (!pathname || !*pathname) + return ERR_PTR(-EINVAL); + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return ERR_PTR(error); + + inode = path.dentry->d_inode; + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto fail; + error = -EACCES; + if (path.mnt->mnt_flags & MNT_NODEV) + goto fail; + error = -ENOMEM; + bdev = bd_acquire(inode); + if (!bdev) + goto fail; +out: + path_put(&path); + return bdev; +fail: + bdev = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL(lookup_bdev); + +/** + * open_bdev_exclusive - open a block device by name and set it up for use + * + * @path: special file representing the block device + * @mode: FMODE_... combination to pass be used + * @holder: owner for exclusion + * + * Open the blockdevice described by the special file at @path, claim it + * for the @holder. + */ +struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int error = 0; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + error = blkdev_get(bdev, mode); + if (error) + return ERR_PTR(error); + error = -EACCES; + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) + goto blkdev_put; + error = bd_claim(bdev, holder); + if (error) + goto blkdev_put; + + return bdev; + +blkdev_put: + blkdev_put(bdev, mode); + return ERR_PTR(error); +} + +EXPORT_SYMBOL(open_bdev_exclusive); + +/** + * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() + * + * @bdev: blockdevice to close + * @mode: mode, must match that used to open. + * + * This is the counterpart to open_bdev_exclusive(). 
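open_bdev_exclusive() and close_bdev_exclusive() are the by-name counterparts to blkdev_get()/blkdev_put() plus bd_claim()/bd_release(). A sketch of the usual pairing; the path and the holder cookie (filesystems typically pass their file_system_type pointer) are assumptions, not taken from this patch:

/* Sketch only: exclusive open by path, as a mount path would do it. */
static int exclusive_open_example(void *holder)
{
	struct block_device *bdev;

	bdev = open_bdev_exclusive("/dev/sda1", FMODE_READ | FMODE_WRITE, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... exclusive read/write access ... */
	close_bdev_exclusive(bdev, FMODE_READ | FMODE_WRITE);
	return 0;
}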
+ */ +void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) +{ + bd_release(bdev); + blkdev_put(bdev, mode); +} + +EXPORT_SYMBOL(close_bdev_exclusive); + +int __invalidate_device(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + int res = 0; + + if (sb) { + /* + * no need to lock the super, get_super holds the + * read mutex so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev); + return res; +} +EXPORT_SYMBOL(__invalidate_device); diff --git a/libdde-linux26/lib/src/fs/buffer.c b/libdde-linux26/lib/src/fs/buffer.c new file mode 100644 index 00000000..d3b1c445 --- /dev/null +++ b/libdde-linux26/lib/src/fs/buffer.c @@ -0,0 +1,3474 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992, 2002 Linus Torvalds + */ + +/* + * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 + * + * Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + * + * Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM + * + * Added 32k buffer block sizes - these are required older ARM systems. - RMK + * + * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> + */ + +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/capability.h> +#include <linux/blkdev.h> +#include <linux/file.h> +#include <linux/quotaops.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/writeback.h> +#include <linux/hash.h> +#include <linux/suspend.h> +#include <linux/buffer_head.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/bio.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/bitops.h> +#include <linux/mpage.h> +#include <linux/bit_spinlock.h> + +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) + +inline void +init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_end_io = handler; + bh->b_private = private; +} + +static int sync_buffer(void *word) +{ + struct block_device *bd; + struct buffer_head *bh + = container_of(word, struct buffer_head, b_state); + + smp_mb(); + bd = bh->b_bdev; + if (bd) + blk_run_address_space(bd->bd_inode->i_mapping); + io_schedule(); + return 0; +} + +void __lock_buffer(struct buffer_head *bh) +{ + wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__lock_buffer); + +void unlock_buffer(struct buffer_head *bh) +{ + clear_bit_unlock(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + wake_up_bit(&bh->b_state, BH_Lock); +} + +/* + * Block until a buffer comes unlocked. This doesn't stop it + * from becoming locked again - you have to lock it yourself + * if you want to preserve its state. 
+ */ +void __wait_on_buffer(struct buffer_head * bh) +{ + wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); +} + +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +} + + +static int quiet_error(struct buffer_head *bh) +{ + if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) + return 0; + return 1; +} + + +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +/* + * End-of-IO handler helper function which does not touch the bh after + * unlocking it. + * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but + * a race there is benign: unlock_buffer() only use the bh's address for + * hashing after unlocking the buffer, so it doesn't actually touch the bh + * itself. + */ +static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_read_sync(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); + put_bh(bh); +} + +void end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ +#ifndef DDE_LINUX + int ret = 0; + + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); + return ret; +#else + WARN_UNIMPL; + return 0; +#endif /* DDE_LINUX */ +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ +#ifndef DDE_LINUX + struct super_block *sb = get_super(bdev); + if (sb) { + int res = fsync_super(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +#else + WARN_UNIMPL; + return -1; +#endif +} + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * This takes the block device bd_mount_sem to make sure no new mounts + * happen on bdev until thaw_bdev() is called. + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. 
+ */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; + + down(&bdev->bd_mount_sem); + sb = get_super(bdev); + if (sb && !(sb->s_flags & MS_RDONLY)) { + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + __fsync_super(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } + } + + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + */ +int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + + if (sb) { + BUG_ON(sb->s_bdev != bdev); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } + drop_super(sb); + } + + up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; +} +EXPORT_SYMBOL(thaw_bdev); + +/* + * Various filesystems appear to want __find_get_block to be non-blocking. + * But it's the page lock which protects the buffers. To get around this, + * we get exclusion from try_to_free_buffers with the blockdev mapping's + * private_lock. + * + * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * may be quite high. This code could TryLock the page, and if that + * succeeds, there is no need to take private_lock. (But if + * private_lock is contended then so is mapping->tree_lock). 
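A sketch of how a snapshot-style caller would drive the freeze/thaw pair implemented above, assuming an invented example_snapshot_bdev; the error handling follows the return conventions visible in freeze_bdev() (NULL when no filesystem is mounted, ERR_PTR() when ->freeze_fs fails).

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/err.h>

/* Sketch only: quiesce a block device around a snapshot operation. */
static int example_snapshot_bdev(struct block_device *bdev)
{
        struct super_block *sb;

        sb = freeze_bdev(bdev);         /* blocks new writers, syncs the fs */
        if (IS_ERR(sb))
                return PTR_ERR(sb);     /* ->freeze_fs refused the freeze */

        /* ... take the snapshot while the device is consistent ... */

        return thaw_bdev(bdev, sb);     /* hand back what freeze_bdev() returned */
}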
+ */ +static struct buffer_head * +__find_get_block_slow(struct block_device *bdev, sector_t block) +{ + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct buffer_head *ret = NULL; + pgoff_t index; + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int all_mapped = 1; + + index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); + page = find_get_page(bd_mapping, index); + if (!page) + goto out; + + spin_lock(&bd_mapping->private_lock); + if (!page_has_buffers(page)) + goto out_unlock; + head = page_buffers(page); + bh = head; + do { + if (bh->b_blocknr == block) { + ret = bh; + get_bh(bh); + goto out_unlock; + } + if (!buffer_mapped(bh)) + all_mapped = 0; + bh = bh->b_this_page; + } while (bh != head); + + /* we might be here because some of the buffers on this page are + * not mapped. This is due to various races between + * file io on the block device and getblk. It gets dealt with + * elsewhere, don't buffer_error if we had some unmapped buffers + */ + if (all_mapped) { + printk("__find_get_block_slow() failed. " + "block=%llu, b_blocknr=%llu\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr); + printk("b_state=0x%08lx, b_size=%zu\n", + bh->b_state, bh->b_size); + printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); + } +out_unlock: + spin_unlock(&bd_mapping->private_lock); + page_cache_release(page); +out: + return ret; +} + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash + dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to + be preserved. These buffers are simply skipped. + + We also skip buffers which are still in use. For example this can + happen if a userspace program is reading the block device. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completion) and + then an invalidate_buffers call that doesn't trash dirty buffers. + + For handling cache coherency with the blkdev pagecache the 'update' case + is been introduced. It is needed to re-read from disk any pinned + buffer. NOTE: re-reading from disk is destructive so we can do it only + when we assume nobody is changing the buffercache under our I/O and when + we think the disk contains more recent information than the buffercache. + The update == 1 pass marks the buffers we need to update, the update == 2 + pass does the actual I/O. 
*/ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + +#ifndef DDE_LINUX + invalidate_bh_lrus(); + invalidate_mapping_pages(mapping, 0, -1); +#endif +} + +/* + * Kick pdflush then try to free up some ZONE_NORMAL memory. + */ +static void free_more_memory(void) +{ + struct zone *zone; + int nid; + +#ifndef DDE_LINUX + wakeup_pdflush(1024); + yield(); + + for_each_online_node(nid) { + (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL, + &zone); + if (zone) + try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, + GFP_NOFS); + } +#else + WARN_UNIMPL; +#endif +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. + */ +static void end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + int page_uptodate = 1; + + BUG_ON(!buffer_async_read(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + clear_buffer_uptodate(bh); + if (!quiet_error(bh)) + buffer_io_error(bh); + SetPageError(page); + } + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + */ + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } while (tmp != bh); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + + /* + * If none of the buffers had errors and they are all + * uptodate then we can set the page uptodate. + */ + if (page_uptodate && !PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return; + +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} + +/* + * Completion handler for block_write_full_page() - pages which are unlocked + * during I/O, and which have PageWriteback cleared upon I/O completion. 
+ */ +static void end_buffer_async_write(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + unsigned long flags; + struct buffer_head *first; + struct buffer_head *tmp; + struct page *page; + + BUG_ON(!buffer_async_write(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!quiet_error(bh)) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_bit(AS_EIO, &page->mapping->flags); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(page); + } + + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + + clear_buffer_async_write(bh); + unlock_buffer(bh); + tmp = bh->b_this_page; + while (tmp != bh) { + if (buffer_async_write(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + end_page_writeback(page); + return; + +still_busy: + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); + return; +} + +/* + * If a page's buffers are under async readin (end_buffer_async_read + * completion) then there is a possibility that another thread of + * control could lock one of the buffers after it has completed + * but while some of the other buffers have not completed. This + * locked buffer would confuse end_buffer_async_read() into not unlocking + * the page. So the absence of BH_Async_Read tells end_buffer_async_read() + * that this buffer is not under async I/O. + * + * The page comes unlocked when it has no locked buffer_async buffers + * left. + * + * PageLocked prevents anyone starting new async I/O reads any of + * the buffers. + * + * PageWriteback is used to prevent simultaneous writeout of the same + * page. + * + * PageLocked prevents anyone from starting writeback of a page which is + * under read I/O (PageWriteback is only ever set against a locked page). + */ +static void mark_buffer_async_read(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_read; + set_buffer_async_read(bh); +} + +void mark_buffer_async_write(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_write; + set_buffer_async_write(bh); +} +EXPORT_SYMBOL(mark_buffer_async_write); + + +/* + * fs/buffer.c contains helper functions for buffer-backed address space's + * fsync functions. A common requirement for buffer-based filesystems is + * that certain data from the backing blockdev needs to be written out for + * a successful fsync(). For example, ext2 indirect blocks need to be + * written back and waited upon before fsync() returns. + * + * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * management of a list of dependent buffers at ->i_mapping->private_list. + * + * Locking is a little subtle: try_to_free_buffers() will remove buffers + * from their controlling inode's queue when they are being freed. But + * try_to_free_buffers() will be operating against the *blockdev* mapping + * at the time, not against the S_ISREG file which depends on those buffers. + * So the locking for private_list is via the private_lock in the address_space + * which backs the buffers. Which is different from the address_space + * against which the buffers are listed. So for a particular address_space, + * mapping->private_lock does *not* protect mapping->private_list! 
In fact, + * mapping->private_list will always be protected by the backing blockdev's + * ->private_lock. + * + * Which introduces a requirement: all buffers on an address_space's + * ->private_list must be from the same address_space: the blockdev's. + * + * address_spaces which do not place buffers at ->private_list via these + * utility functions are free to use private_lock and private_list for + * whatever they want. The only requirement is that list_empty(private_list) + * be true at clear_inode() time. + * + * FIXME: clear_inode should not call invalidate_inode_buffers(). The + * filesystems should do that. invalidate_inode_buffers() should just go + * BUG_ON(!list_empty). + * + * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should + * take an address_space, not an inode. And it should be called + * mark_buffer_dirty_fsync() to clearly define why those buffers are being + * queued up. + * + * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the + * list if it is already on a list. Because if the buffer is on a list, + * it *must* already be on the right one. If not, the filesystem is being + * silly. This will save a ton of locking. But first we have to ensure + * that buffers are taken *off* the old inode's list when they are freed + * (presumably in truncate). That requires careful auditing of all + * filesystems (do it inside bforget()). It could also be done by bringing + * b_inode back. + */ + +/* + * The buffer's backing address_space's private_lock must be held + */ +static void __remove_assoc_queue(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); + WARN_ON(!bh->b_assoc_map); + if (buffer_write_io_error(bh)) + set_bit(AS_EIO, &bh->b_assoc_map->flags); + bh->b_assoc_map = NULL; +} + +int inode_has_buffers(struct inode *inode) +{ + return !list_empty(&inode->i_data.private_list); +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. + */ +static int osync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head *p; + int err = 0; + + spin_lock(lock); +repeat: + list_for_each_prev(p, list) { + bh = BH_ENTRY(p); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + goto repeat; + } + } + spin_unlock(lock); + return err; +} + +/** + * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers + * @mapping: the mapping which wants those buffers written + * + * Starts I/O against the buffers at mapping->private_list, and waits upon + * that I/O. + * + * Basically, this is a convenience function for fsync(). + * @mapping is a file or directory which needs those buffers to be written for + * a successful fsync(). 
+ */ +int sync_mapping_buffers(struct address_space *mapping) +{ + struct address_space *buffer_mapping = mapping->assoc_mapping; + + if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + return 0; + + return fsync_buffers_list(&buffer_mapping->private_lock, + &mapping->private_list); +} +EXPORT_SYMBOL(sync_mapping_buffers); + +/* + * Called when we've recently written block `bblock', and it is known that + * `bblock' was for a buffer_boundary() buffer. This means that the block at + * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's + * dirty, schedule it for IO. So that indirects merge nicely with their data. + */ +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize) +{ + struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + if (bh) { + if (buffer_dirty(bh)) + ll_rw_block(WRITE, 1, &bh); + put_bh(bh); + } +} + +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct address_space *buffer_mapping = bh->b_page->mapping; + + mark_buffer_dirty(bh); + if (!mapping->assoc_mapping) { + mapping->assoc_mapping = buffer_mapping; + } else { + BUG_ON(mapping->assoc_mapping != buffer_mapping); + } + if (!bh->b_assoc_map) { + spin_lock(&buffer_mapping->private_lock); + list_move_tail(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(mark_buffer_dirty_inode); + +/* + * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * dirty. + * + * If warn is true, then emit a warning if the page is not uptodate and has + * not been truncated. + */ +static void __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + spin_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !PageUptodate(page)); + + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&mapping->tree_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); +} + +/* + * Add a page to the dirty page list. + * + * It is a sad fact of life that this function is called from several places + * deeply under spinlocking. It may not sleep. + * + * If the page has buffers, the uptodate buffers are set dirty, to preserve + * dirty-state coherency between the page and the buffers. It the page does + * not have buffers then when they are later attached they will all be set + * dirty. + * + * The buffers are dirtied before the page is dirtied. There's a small race + * window in which a writepage caller may see the page cleanness but not the + * buffer dirtiness. That's fine. If this code were to set the page dirty + * before the buffers, a concurrent writepage caller could clear the page dirty + * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean + * page on the dirty page list. + * + * We use private_lock to lock against try_to_free_buffers while using the + * page's buffer list. Also use this to protect against clean buffers being + * added to the page after it was set dirty. + * + * FIXME: may need to call ->reservepage here as well. That's rather up to the + * address_space though. 
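A sketch of the fsync() convention described above, assuming a filesystem that queued its metadata buffers with mark_buffer_dirty_inode(); example_sync_file is an invented name and the bare return is a simplification.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Sketch only: a ->fsync that flushes the buffers queued on private_list. */
static int example_sync_file(struct file *file, struct dentry *dentry,
                             int datasync)
{
        struct inode *inode = dentry->d_inode;

        /* writes out and waits on everything added via mark_buffer_dirty_inode() */
        return sync_mapping_buffers(inode->i_mapping);
}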
+ */ +int __set_page_dirty_buffers(struct page *page) +{ + int newly_dirty; + struct address_space *mapping = page_mapping(page); + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + set_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; +} +EXPORT_SYMBOL(__set_page_dirty_buffers); + +/* + * Write out and wait upon a list of buffers. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. + * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ +static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head tmp; + struct address_space *mapping; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp); + + spin_lock(lock); + while (!list_empty(list)) { + bh = BH_ENTRY(list->next); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh) || buffer_locked(bh)) { + list_add(&bh->b_assoc_buffers, &tmp); + bh->b_assoc_map = mapping; + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(lock); + /* + * Ensure any pending I/O completes so that + * ll_rw_block() actually writes the current + * contents - it is a noop if I/O is still in + * flight on potentially older contents. + */ + ll_rw_block(SWRITE_SYNC, 1, &bh); + brelse(bh); + spin_lock(lock); + } + } + } + + while (!list_empty(&tmp)) { + bh = BH_ENTRY(tmp.prev); + get_bh(bh); + mapping = bh->b_assoc_map; + __remove_assoc_queue(bh); + /* Avoid race with mark_buffer_dirty_inode() which does + * a lockless check and we rely on seeing the dirty bit */ + smp_mb(); + if (buffer_dirty(bh)) { + list_add(&bh->b_assoc_buffers, + &mapping->private_list); + bh->b_assoc_map = mapping; + } + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + } + + spin_unlock(lock); + err2 = osync_buffers_list(lock, list); + if (err) + return err; + else + return err2; +} + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + * + * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * assumes that all the buffers are against the blockdev. Not true + * for reiserfs. 
+ */ +void invalidate_inode_buffers(struct inode *inode) +{ + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) + __remove_assoc_queue(BH_ENTRY(list->next)); + spin_unlock(&buffer_mapping->private_lock); + } +} +EXPORT_SYMBOL(invalidate_inode_buffers); + +/* + * Remove any clean buffers from the inode's buffer list. This is called + * when we're trying to free the inode itself. Those buffers can pin it. + * + * Returns true if all buffers were removed. + */ +int remove_inode_buffers(struct inode *inode) +{ + int ret = 1; + + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) { + struct buffer_head *bh = BH_ENTRY(list->next); + if (buffer_dirty(bh)) { + ret = 0; + break; + } + __remove_assoc_queue(bh); + } + spin_unlock(&buffer_mapping->private_lock); + } + return ret; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * + * The retry flag is used to differentiate async IO (paging, swapping) + * which may not fail from ordinary buffer allocations. + */ +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, + int retry) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = alloc_buffer_head(GFP_NOFS); + if (!bh) + goto no_grow; + + bh->b_bdev = NULL; + bh->b_this_page = head; + bh->b_blocknr = -1; + head = bh; + + bh->b_state = 0; + atomic_set(&bh->b_count, 0); + bh->b_private = NULL; + bh->b_size = size; + + /* Link the buffer to its page */ + set_bh_page(bh, page, offset); + + init_buffer(bh, NULL, NULL); + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + do { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } while (head); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!retry) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + free_more_memory(); + goto try_again; +} +EXPORT_SYMBOL_GPL(alloc_page_buffers); + +static inline void +link_dev_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_buffers(page, head); +} + +/* + * Initialise the state of a blockdev page's buffers. 
+ */ +static void +init_page_buffers(struct page *page, struct block_device *bdev, + sector_t block, int size) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + int uptodate = PageUptodate(page); + + do { + if (!buffer_mapped(bh)) { + init_buffer(bh, NULL, NULL); + bh->b_bdev = bdev; + bh->b_blocknr = block; + if (uptodate) + set_buffer_uptodate(bh); + set_buffer_mapped(bh); + } + block++; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Create the page-cache page that contains the requested block. + * + * This is user purely for blockdev mappings. + */ +static struct page * +grow_dev_page(struct block_device *bdev, sector_t block, + pgoff_t index, int size) +{ + struct inode *inode = bdev->bd_inode; + struct page *page; + struct buffer_head *bh; + +#ifdef DDE_LINUX + WARN_UNIMPL; + return NULL; +#endif + + page = find_or_create_page(inode->i_mapping, index, + (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + if (!page) + return NULL; + + BUG_ON(!PageLocked(page)); + + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (bh->b_size == size) { + init_page_buffers(page, bdev, block, size); + return page; + } + if (!try_to_free_buffers(page)) + goto failed; + } + + /* + * Allocate some buffers for this page + */ + bh = alloc_page_buffers(page, size, 0); + if (!bh) + goto failed; + + /* + * Link the page to the buffers and initialise them. Take the + * lock to be atomic wrt __find_get_block(), which does not + * run under the page lock. + */ + spin_lock(&inode->i_mapping->private_lock); + link_dev_buffers(page, bh); + init_page_buffers(page, bdev, block, size); + spin_unlock(&inode->i_mapping->private_lock); + return page; + +failed: + BUG(); + unlock_page(page); + page_cache_release(page); + return NULL; +} + +/* + * Create buffers for the specified block device block's page. If + * that page was dirty, the buffers are set dirty also. + */ +static int +grow_buffers(struct block_device *bdev, sector_t block, int size) +{ + struct page *page; + pgoff_t index; + int sizebits; + + sizebits = -1; + do { + sizebits++; + } while ((size << sizebits) < PAGE_SIZE); + + index = block >> sizebits; + + /* + * Check for a block which wants to lie outside our maximum possible + * pagecache index. (this comparison is done using sector_t types). + */ + if (unlikely(index != block >> sizebits)) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR "%s: requested out-of-range block %llu for " + "device %s\n", + __func__, (unsigned long long)block, + bdevname(bdev, b)); + return -EIO; + } + block = index << sizebits; + /* Create a page with the proper size buffers.. 
*/ + page = grow_dev_page(bdev, block, index, size); + if (!page) + return 0; + unlock_page(page); + page_cache_release(page); + return 1; +} + +static struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, int size) +{ + /* Size must be multiple of hard sectorsize */ + if (unlikely(size & (bdev_hardsect_size(bdev)-1) || + (size < 512 || size > PAGE_SIZE))) { + printk(KERN_ERR "getblk(): invalid block size %d requested\n", + size); + printk(KERN_ERR "hardsect size: %d\n", + bdev_hardsect_size(bdev)); + + dump_stack(); + return NULL; + } + + for (;;) { + struct buffer_head * bh; + int ret; + + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; + + ret = grow_buffers(bdev, block, size); + if (ret < 0) + return NULL; + if (ret == 0) + free_more_memory(); + } +} + +/* + * The relationship between dirty buffers and dirty pages: + * + * Whenever a page has any dirty buffers, the page's dirty bit is set, and + * the page is tagged dirty in its radix tree. + * + * At all times, the dirtiness of the buffers represents the dirtiness of + * subsections of the page. If the page has buffers, the page dirty bit is + * merely a hint about the true dirty state. + * + * When a page is set dirty in its entirety, all its buffers are marked dirty + * (if the page has buffers). + * + * When a buffer is marked dirty, its page is dirtied, but the page's other + * buffers are not. + * + * Also. When blockdev buffers are explicitly read with bread(), they + * individually become uptodate. But their backing page remains not + * uptodate - even if all of its buffers are uptodate. A subsequent + * block_read_full_page() against that page will discover all the uptodate + * buffers, will set the page uptodate and will perform no I/O. + */ + +/** + * mark_buffer_dirty - mark a buffer_head as needing writeout + * @bh: the buffer_head to mark dirty + * + * mark_buffer_dirty() will set the dirty bit against the buffer, then set its + * backing page dirty, then tag the page as dirty in its address_space's radix + * tree and then attach the address_space's inode to its superblock's dirty + * inode list. + * + * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mapping->tree_lock and the global inode_lock. + */ +void mark_buffer_dirty(struct buffer_head *bh) +{ +#ifndef DDE_LINUX + WARN_ON_ONCE(!buffer_uptodate(bh)); + + /* + * Very *carefully* optimize the it-is-already-dirty case. + * + * Don't let the final "is it dirty" escape to before we + * perhaps modified the buffer. + */ + if (buffer_dirty(bh)) { + smp_mb(); + if (buffer_dirty(bh)) + return; + } + + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) + __set_page_dirty(page, page_mapping(page), 0); + } +#else + WARN_UNIMPL; +#endif +} + +/* + * Decrement a buffer_head's reference count. If all buffers against a page + * have zero reference count, are clean and unlocked, and if the page is clean + * and unlocked then try_to_free_buffers() may strip the buffers from the page + * in preparation for freeing it (sometimes, rarely, buffers are removed from + * a page but it ends up not being freed, and buffers may later be reattached). + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. 
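A sketch of the propagation path mark_buffer_dirty() is documented to perform, assuming the caller already holds a mapped, uptodate metadata buffer; example_update_counter is an invented name.

#include <linux/buffer_head.h>

/* Sketch only: modify a metadata buffer and let the dirty state propagate. */
static void example_update_counter(struct buffer_head *bh, __le32 value)
{
        __le32 *counter = (__le32 *)bh->b_data;

        *counter = value;               /* buffer is mapped and uptodate */
        mark_buffer_dirty(bh);          /* dirties the buffer, its page, and the inode */
}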
+ */ +void __bforget(struct buffer_head *bh) +{ + clear_buffer_dirty(bh); + if (bh->b_assoc_map) { + struct address_space *buffer_mapping = bh->b_page->mapping; + + spin_lock(&buffer_mapping->private_lock); + list_del_init(&bh->b_assoc_buffers); + bh->b_assoc_map = NULL; + spin_unlock(&buffer_mapping->private_lock); + } + __brelse(bh); +} + +static struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } else { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + } + brelse(bh); + return NULL; +} + +/* + * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). + * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their + * refcount elevated by one when they're in an LRU. A buffer can only appear + * once in a particular CPU's LRU. A single buffer can be present in multiple + * CPU's LRUs at the same time. + * + * This is a transparent caching front-end to sb_bread(), sb_getblk() and + * sb_find_get_block(). + * + * The LRUs themselves only need locking against invalidate_bh_lrus. We use + * a local interrupt disable for that. + */ + +#define BH_LRU_SIZE 8 + +struct bh_lru { + struct buffer_head *bhs[BH_LRU_SIZE]; +}; + +static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; + +#ifdef CONFIG_SMP +#define bh_lru_lock() local_irq_disable() +#define bh_lru_unlock() local_irq_enable() +#else +#define bh_lru_lock() preempt_disable() +#define bh_lru_unlock() preempt_enable() +#endif + +static inline void check_irqs_on(void) +{ +#ifdef irqs_disabled + BUG_ON(irqs_disabled()); +#endif +} + +/* + * The LRU management algorithm is dopey-but-simple. Sorry. + */ +static void bh_lru_install(struct buffer_head *bh) +{ + struct buffer_head *evictee = NULL; + struct bh_lru *lru; + + check_irqs_on(); + bh_lru_lock(); + lru = &__get_cpu_var(bh_lrus); + if (lru->bhs[0] != bh) { + struct buffer_head *bhs[BH_LRU_SIZE]; + int in; + int out = 0; + + get_bh(bh); + bhs[out++] = bh; + for (in = 0; in < BH_LRU_SIZE; in++) { + struct buffer_head *bh2 = lru->bhs[in]; + + if (bh2 == bh) { + __brelse(bh2); + } else { + if (out >= BH_LRU_SIZE) { + BUG_ON(evictee != NULL); + evictee = bh2; + } else { + bhs[out++] = bh2; + } + } + } + while (out < BH_LRU_SIZE) + bhs[out++] = NULL; + memcpy(lru->bhs, bhs, sizeof(bhs)); + } + bh_lru_unlock(); + + if (evictee) + __brelse(evictee); +} + +/* + * Look up the bh in this cpu's LRU. If it's there, move it to the head. + */ +static struct buffer_head * +lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *ret = NULL; + struct bh_lru *lru; + unsigned int i; + + check_irqs_on(); + bh_lru_lock(); + lru = &__get_cpu_var(bh_lrus); + for (i = 0; i < BH_LRU_SIZE; i++) { + struct buffer_head *bh = lru->bhs[i]; + + if (bh && bh->b_bdev == bdev && + bh->b_blocknr == block && bh->b_size == size) { + if (i) { + while (i) { + lru->bhs[i] = lru->bhs[i - 1]; + i--; + } + lru->bhs[0] = bh; + } + get_bh(bh); + ret = bh; + break; + } + } + bh_lru_unlock(); + return ret; +} + +/* + * Perform a pagecache lookup for the matching buffer. If it's there, refresh + * it in the LRU and mark it as accessed. 
If it is not present then return + * NULL + */ +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = lookup_bh_lru(bdev, block, size); + + if (bh == NULL) { + bh = __find_get_block_slow(bdev, block); + if (bh) + bh_lru_install(bh); + } + if (bh) + touch_buffer(bh); + return bh; +} +EXPORT_SYMBOL(__find_get_block); + +/* + * __getblk will locate (and, if necessary, create) the buffer_head + * which corresponds to the passed block_device, block and size. The + * returned buffer has its reference count incremented. + * + * __getblk() cannot fail - it just keeps trying. If you pass it an + * illegal block number, __getblk() will happily return a buffer_head + * which represents the non-existent block. Very weird. + * + * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() + * attempt is failing. FIXME, perhaps? + */ +struct buffer_head * +__getblk(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + might_sleep(); + if (bh == NULL) + bh = __getblk_slow(bdev, block, size); + return bh; +} +EXPORT_SYMBOL(__getblk); + +/* + * Do async read-ahead on a buffer.. + */ +void __breadahead(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + if (likely(bh)) { + ll_rw_block(READA, 1, &bh); + brelse(bh); + } +} +EXPORT_SYMBOL(__breadahead); + +/** + * __bread() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * It returns NULL if the block was unreadable. + */ +struct buffer_head * +__bread(struct block_device *bdev, sector_t block, unsigned size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + + if (likely(bh) && !buffer_uptodate(bh)) + bh = __bread_slow(bh); + return bh; +} +EXPORT_SYMBOL(__bread); + +/* + * invalidate_bh_lrus() is called rarely - but not only at unmount. + * This doesn't race because it runs in each cpu either in irq + * or with preempt disabled. + */ +static void invalidate_bh_lru(void *arg) +{ + struct bh_lru *b = &get_cpu_var(bh_lrus); + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + put_cpu_var(bh_lrus); +} + +void invalidate_bh_lrus(void) +{ +#ifndef DDE_LINUX + on_each_cpu(invalidate_bh_lru, NULL, 1); +#endif +} +EXPORT_SYMBOL_GPL(invalidate_bh_lrus); + +void set_bh_page(struct buffer_head *bh, + struct page *page, unsigned long offset) +{ + bh->b_page = page; + BUG_ON(offset >= PAGE_SIZE); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Called when truncating a buffer on a page completely. 
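A sketch pairing __bread() with brelse(), matching the reference-counting rules spelled out for __brelse() above; example_read_block and the -EIO mapping are illustrative.

#include <linux/buffer_head.h>
#include <linux/errno.h>

/* Sketch only: synchronously read one block and drop the reference. */
static int example_read_block(struct block_device *bdev, sector_t blocknr,
                              unsigned size)
{
        struct buffer_head *bh = __bread(bdev, blocknr, size);

        if (!bh)
                return -EIO;            /* the block was unreadable */
        /* ... bh->b_data now holds 'size' bytes of valid data ... */
        brelse(bh);                     /* balance the reference __bread() took */
        return 0;
}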
+ */ +static void discard_buffer(struct buffer_head * bh) +{ + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + unlock_buffer(bh); +} + +/** + * block_invalidatepage - invalidate part of all of a buffer-backed page + * + * @page: the page which is affected + * @offset: the index of the truncation point + * + * block_invalidatepage() is called when all or part of the page has become + * invalidatedby a truncate operation. + * + * block_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void block_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (offset == 0) + try_to_release_page(page, 0); +out: + return; +} +EXPORT_SYMBOL(block_invalidatepage); + +/* + * We attach and possibly dirty the buffers atomically wrt + * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * is already excluded via the page lock. + */ +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + struct buffer_head *bh, *head, *tail; + + head = alloc_page_buffers(page, blocksize, 1); + bh = head; + do { + bh->b_state |= b_state; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + + spin_lock(&page->mapping->private_lock); + if (PageUptodate(page) || PageDirty(page)) { + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (PageUptodate(page)) + set_buffer_uptodate(bh); + bh = bh->b_this_page; + } while (bh != head); + } + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} +EXPORT_SYMBOL(create_empty_buffers); + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can + * be writeout I/O going on against recently-freed buffers. We don't + * wait on that I/O in bforget() - it's more efficient to wait on the I/O + * only if we really need to. That happens here. 
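Buffer-backed filesystems can typically point their address_space_operations at the helpers exported here; a sketch with invented example_ names, wiring in block_invalidatepage() and the __set_page_dirty_buffers() defined earlier in this file.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Sketch only: the dirtying/truncation hooks of a buffer-backed mapping. */
static const struct address_space_operations example_aops = {
        .set_page_dirty = __set_page_dirty_buffers,
        .invalidatepage = block_invalidatepage,
};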
+ */ +void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +{ + struct buffer_head *old_bh; + + might_sleep(); + + old_bh = __find_get_block_slow(bdev, block); + if (old_bh) { + clear_buffer_dirty(old_bh); + wait_on_buffer(old_bh); + clear_buffer_req(old_bh); + __brelse(old_bh); + } +} +EXPORT_SYMBOL(unmap_underlying_metadata); + +/* + * NOTE! All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * While block_write_full_page is writing back the dirty buffers under + * the page lock, whoever dirtied the buffers may decide to clean them + * again at any time. We handle that by only looking at the buffer + * state inside lock_buffer(). + * + * If block_write_full_page() is called for regular writeback + * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a + * locked buffer. This only can happen if someone has written the buffer + * directly, with submit_bh(). At the address_space level PageWriteback + * prevents this contention from occurring. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, + get_block_t *get_block, struct writeback_control *wbc) +{ + int err; + sector_t block; + sector_t last_block; + struct buffer_head *bh, *head; + const unsigned blocksize = 1 << inode->i_blkbits; + int nr_underway = 0; + + BUG_ON(!PageLocked(page)); + + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. + */ + + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + head = page_buffers(page); + bh = head; + + /* + * Get all the dirty buffers mapped to disk addresses and + * handle any aliases from the underlying blockdev's mapping. + */ + do { + if (block > last_block) { + /* + * mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. + */ + /* + * The buffer was zeroed by block_write_full_page() + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + goto recover; + clear_buffer_delay(bh); + if (buffer_new(bh)) { + /* blockdev mappings never come here */ + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from pdflush and kswapd + * activity, but those code paths have their own higher-level + * throttling. 
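The Mapped/Uptodate table above is the contract every get_block_t callback must satisfy. A minimal sketch under the assumption of a hypothetical linear block map (example_get_block and EXAMPLE_DATA_START are invented); a real callback would also allocate blocks when 'create' is set and leave holes unmapped.

#include <linux/buffer_head.h>

#define EXAMPLE_DATA_START      64      /* hypothetical start of the data area */

/* Sketch only: map logical block 'iblock' to a physical block. */
int example_get_block(struct inode *inode, sector_t iblock,
                      struct buffer_head *bh_result, int create)
{
        /* map_bh() fills b_bdev, b_blocknr, b_size and sets the mapped bit */
        map_bh(bh_result, inode->i_sb, EXAMPLE_DATA_START + iblock);
        return 0;
}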
+ */ + if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + err = 0; +done: + if (nr_underway == 0) { + /* + * The page was marked dirty, but the buffers were + * clean. Someone wrote them back by hand with + * ll_rw_block/submit_bh. A rare case. + */ + end_page_writeback(page); + + /* + * The page and buffer_heads can be released at any time from + * here on. + */ + } + return err; + +recover: + /* + * ENOSPC, or some other error. We may already have added some + * blocks to the file, so we need to write these out to avoid + * exposing stale data. + * The page is currently locked and not marked for writeback + */ + bh = head; + /* Recovery: lock and submit the mapped buffers */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * The buffer may have been set dirty during + * attachment to a dirty page. + */ + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + mapping_set_error(page->mapping, err); + set_page_writeback(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + goto done; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. 
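Filesystems normally reach __block_write_full_page() through block_write_full_page(), which appears later in this file; a sketch of the usual ->writepage wrapper, reusing the hypothetical example_get_block from the sketch above.

#include <linux/buffer_head.h>
#include <linux/writeback.h>

int example_get_block(struct inode *, sector_t, struct buffer_head *, int);

/* Sketch only: a ->writepage built on block_write_full_page(). */
static int example_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, example_get_block, wbc);
}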
+ */ +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; + + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + zero_user(page, start, size); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } + + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(page_zero_new_buffers); + +static int __block_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to, get_block_t *get_block) +{ + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + bbits = inode->i_blkbits; + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, + to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + return err; +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + clear_buffer_new(bh); + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). 
Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ +#ifndef DDE_LINUX + struct inode *inode = mapping->host; + int status = 0; + struct page *page; + pgoff_t index; + unsigned start, end; + int ownpage = 0; + + index = pos >> PAGE_CACHE_SHIFT; + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + len; + + page = *pagep; + if (page == NULL) { + ownpage = 1; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + status = -ENOMEM; + goto out; + } + *pagep = page; + } else + BUG_ON(!PageLocked(page)); + + status = __block_prepare_write(inode, page, start, end, get_block); + if (unlikely(status)) { + ClearPageUptodate(page); + + if (ownpage) { + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + +#ifndef DDE_LINUX + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); +#endif + } + } + +out: + return status; +#else + WARN_UNIMPL; + return -1; +#endif +} +EXPORT_SYMBOL(block_write_begin); + +int block_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start; + + start = pos & (PAGE_CACHE_SIZE - 1); + + if (unlikely(copied < len)) { + /* + * The buffers that were written will now be uptodate, so we + * don't have to worry about a readpage reading them and + * overwriting a partial write. However if we have encountered + * a short write and only partially written into a buffer, it + * will not be marked uptodate, so a readpage might come in and + * destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a + * non uptodate page as a zero-length write, and force the + * caller to redo the whole thing. + */ + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start+copied, start+len); + } + flush_dcache_page(page); + + /* This could be a short (even 0-length) commit */ + __block_commit_write(inode, page, start, start+copied); + + return copied; +} +EXPORT_SYMBOL(block_write_end); + +int generic_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int i_size_changed = 0; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. 
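A sketch of the write-path wiring this implies: a filesystem's ->write_begin delegates to block_write_begin() with its own get_block, and its ->write_end can usually be generic_write_end() unchanged. The example_ names and the reuse of the hypothetical example_get_block are assumptions.

#include <linux/fs.h>
#include <linux/buffer_head.h>

int example_get_block(struct inode *, sector_t, struct buffer_head *, int);

/* Sketch only: ->write_begin built on block_write_begin(). */
static int example_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
{
        *pagep = NULL;  /* let block_write_begin() allocate and lock the page */
        return block_write_begin(file, mapping, pos, len, flags,
                                 pagep, fsdata, example_get_block);
}

/* Sketch only: the matching (partial) aops entries. */
static const struct address_space_operations example_write_aops = {
        .write_begin    = example_write_begin,
        .write_end      = generic_write_end,
};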
+ */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} +EXPORT_SYMBOL(generic_write_end); + +/* + * block_is_partially_uptodate checks whether buffers within a page are + * uptodate or not. + * + * Returns true if all buffers which correspond to a file portion + * we want to read are uptodate. + */ +int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, + unsigned long from) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + unsigned to; + struct buffer_head *bh, *head; + int ret = 1; + + if (!page_has_buffers(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = from + to; + if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) + return 0; + + head = page_buffers(page); + bh = head; + block_start = 0; + do { + block_end = block_start + blocksize; + if (block_end > from && block_start < to) { + if (!buffer_uptodate(bh)) { + ret = 0; + break; + } + if (block_end >= to) + break; + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} +EXPORT_SYMBOL(block_is_partially_uptodate); + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * set/clear_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + sector_t iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize; + int nr, i; + int fully_mapped = 1; + + BUG_ON(!PageLocked(page)); + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + int err = 0; + + fully_mapped = 0; + if (iblock < lblock) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + SetPageError(page); + } + if (!buffer_mapped(bh)) { + zero_user(page, i * blocksize, blocksize); + if (!err) + set_buffer_uptodate(bh); + continue; + } + /* + * get_block() might have updated the buffer + * synchronously + */ + if (buffer_uptodate(bh)) + continue; + } + arr[nr++] = bh; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + SetPageMappedToDisk(page); + + if (!nr) { + /* + * All buffers are uptodate - we can set the page uptodate + * as well. But not if get_block() returned an error. + */ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + lock_buffer(bh); + mark_buffer_async_read(bh); + } + + /* + * Stage 3: start the IO. 
Check for uptodateness + * inside the buffer lock in case another process reading + * the underlying blockdev brought it uptodate (the sct fix). + */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_async_read(bh, 1); + else + submit_bh(READ, bh); + } + return 0; +} + +/* utility function for filesystems that need to do work on expanding + * truncates. Uses filesystem pagecache writes to allow the filesystem to + * deal with the hole. + */ +int generic_cont_expand_simple(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + void *fsdata; + unsigned long limit; + int err; + + err = -EFBIG; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && size > (loff_t)limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (size > inode->i_sb->s_maxbytes) + goto out; + + err = pagecache_write_begin(NULL, mapping, size, 0, + AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, + &page, &fsdata); + if (err) + goto out; + + err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); + BUG_ON(err > 0); + +out: + return err; +} + +static int cont_expand_zero(struct file *file, struct address_space *mapping, + loff_t pos, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + struct page *page; + void *fsdata; + pgoff_t index, curidx; + loff_t curpos; + unsigned zerofrom, offset, len; + int err = 0; + + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = PAGE_CACHE_SIZE - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + + balance_dirty_pages_ratelimited(mapping); + } + + /* page covers the boundary, find the boundary offset */ + if (index == curidx) { + zerofrom = curpos & ~PAGE_CACHE_MASK; + /* if we will expand the thing last block will be filled */ + if (offset <= zerofrom) { + goto out; + } + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + len = offset - zerofrom; + + err = pagecache_write_begin(file, mapping, curpos, len, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (err) + goto out; + zero_user(page, zerofrom, len); + err = pagecache_write_end(file, mapping, curpos, len, len, + page, fsdata); + if (err < 0) + goto out; + BUG_ON(err != len); + err = 0; + } +out: + return err; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. 
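+ *
+ * An illustrative sketch (hypothetical myfs_* names): a filesystem
+ * without holes would typically call this from its own write_begin,
+ * passing its get_block callback and a per-inode byte counter that
+ * records how far the file has been filled in so far:
+ *
+ *	static int myfs_write_begin(struct file *file,
+ *			struct address_space *mapping, loff_t pos,
+ *			unsigned len, unsigned flags,
+ *			struct page **pagep, void **fsdata)
+ *	{
+ *		return cont_write_begin(file, mapping, pos, len, flags,
+ *				pagep, fsdata, myfs_get_block,
+ *				&myfs_i(mapping->host)->i_filled_bytes);
+ *	}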
+ */ +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + unsigned zerofrom; + int err; + + err = cont_expand_zero(file, mapping, pos, bytes); + if (err) + goto out; + + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (pos+len > *bytes && zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + *pagep = NULL; + err = block_write_begin(file, mapping, pos, len, + flags, pagep, fsdata, get_block); +out: + return err; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) + ClearPageUptodate(page); + return err; +} + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + return 0; +} + +/* + * block_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int +block_page_mkwrite(struct vm_area_struct *vma, struct page *page, + get_block_t get_block) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + unsigned long end; + loff_t size; + int ret = -EINVAL; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) + end = size & ~PAGE_CACHE_MASK; + else + end = PAGE_CACHE_SIZE; + + ret = block_prepare_write(page, 0, end, get_block); + if (!ret) + ret = block_commit_write(page, 0, end); + +out_unlock: + unlock_page(page); + return ret; +} + +/* + * nobh_write_begin()'s prereads are special: the buffer_heads are freed + * immediately, while under the page lock. So it needs a special end_io + * handler which does not touch the bh after unlocking it. + */ +static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); +} + +/* + * Attach the singly-linked list of buffers created by nobh_write_begin, to + * the page (converting it to circular linked list and taking care of page + * dirty races). 
+ */ +static void attach_nobh_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh; + + BUG_ON(!PageLocked(page)); + + spin_lock(&page->mapping->private_lock); + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (!bh->b_this_page) + bh->b_this_page = head; + bh = bh->b_this_page; + } while (bh != head); + attach_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct buffer_head *head, *bh; + struct page *page; + pgoff_t index; + unsigned from, to; + unsigned block_in_page; + unsigned block_start, block_end; + sector_t block_in_file; + int nr_reads = 0; + int ret = 0; + int is_mapped_to_disk = 1; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + *fsdata = NULL; + + if (page_has_buffers(page)) { + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, get_block); + } + + if (PageMappedToDisk(page)) + return 0; + + /* + * Allocate buffers so that we can keep track of state, and potentially + * attach them to the page if an error occurs. In the common case of + * no error, they will just be freed again without ever being attached + * to the page (which is all OK, because we're under the page lock). + * + * Be careful: the buffer linked list is a NULL terminated one, rather + * than the circular one we're used to. + */ + head = alloc_page_buffers(page, blocksize, 0); + if (!head) { + ret = -ENOMEM; + goto out_release; + } + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + + /* + * We loop across all blocks in the page, whether or not they are + * part of the affected region. This is so we can discover if the + * page is fully mapped-to-disk. + */ + for (block_start = 0, block_in_page = 0, bh = head; + block_start < PAGE_CACHE_SIZE; + block_in_page++, block_start += blocksize, bh = bh->b_this_page) { + int create; + + block_end = block_start + blocksize; + bh->b_state = 0; + create = 1; + if (block_start >= to) + create = 0; + ret = get_block(inode, block_in_file + block_in_page, + bh, create); + if (ret) + goto failed; + if (!buffer_mapped(bh)) + is_mapped_to_disk = 0; + if (buffer_new(bh)) + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + if (buffer_new(bh) || !buffer_mapped(bh)) { + zero_user_segments(page, block_start, from, + to, block_end); + continue; + } + if (buffer_uptodate(bh)) + continue; /* reiserfs does this */ + if (block_start < from || block_end > to) { + lock_buffer(bh); + bh->b_end_io = end_buffer_read_nobh; + submit_bh(READ, bh); + nr_reads++; + } + } + + if (nr_reads) { + /* + * The page is locked, so these buffers are protected from + * any VM or truncate activity. Hence we don't need to care + * for the buffer_head refcounts. 
+ */ + for (bh = head; bh; bh = bh->b_this_page) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + if (ret) + goto failed; + } + + if (is_mapped_to_disk) + SetPageMappedToDisk(page); + + *fsdata = head; /* to be released by nobh_write_end */ + + return 0; + +failed: + BUG_ON(!ret); + /* + * Error recovery is a bit difficult. We need to zero out blocks that + * were newly allocated, and dirty them to ensure they get written out. + * Buffers need to be attached to the page at this point, otherwise + * the handling of potential IO errors during writeout would be hard + * (could try doing synchronous writeout, but what if that fails too?) + */ + attach_nobh_buffers(page, head); + page_zero_new_buffers(page, from, to); + +out_release: + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + + return ret; +} +EXPORT_SYMBOL(nobh_write_begin); + +int nobh_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *head = fsdata; + struct buffer_head *bh; + BUG_ON(fsdata != NULL && page_has_buffers(page)); + + if (unlikely(copied < len) && head) + attach_nobh_buffers(page, head); + if (page_has_buffers(page)) + return generic_write_end(file, mapping, pos, len, + copied, page, fsdata); + + SetPageUptodate(page); + set_page_dirty(page); + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + mark_inode_dirty(inode); + } + + unlock_page(page); + page_cache_release(page); + + while (head) { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } + + return copied; +} +EXPORT_SYMBOL(nobh_write_end); + +/* + * nobh_writepage() - based on block_full_write_page() except + * that it tries to operate without attaching bufferheads to + * the page. + */ +int nobh_writepage(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int ret; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto out; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ +#if 0 + /* Not really sure about this - do we need this ? */ + if (page->mapping->a_ops->invalidatepage) + page->mapping->a_ops->invalidatepage(page, offset); +#endif + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); +out: + ret = mpage_writepage(page, get_block, wbc); + if (ret == -EAGAIN) + ret = __block_write_full_page(inode, page, get_block, wbc); + return ret; +} +EXPORT_SYMBOL(nobh_writepage); + +int nobh_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head map_bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (page_has_buffers(page)) { +has_buffers: + unlock_page(page); + page_cache_release(page); + return block_truncate_page(mapping, from, get_block); + } + + /* Find the buffer that contains "offset" */ + pos = blocksize; + while (offset >= pos) { + iblock++; + pos += blocksize; + } + + err = get_block(inode, iblock, &map_bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(&map_bh)) + goto unlock; + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (!PageUptodate(page)) { + err = mapping->a_ops->readpage(NULL, page); + if (err) { + page_cache_release(page); + goto out; + } + lock_page(page); + if (!PageUptodate(page)) { + err = -EIO; + goto unlock; + } + if (page_has_buffers(page)) + goto has_buffers; + } + zero_user(page, offset, length); + set_page_dirty(page); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} +EXPORT_SYMBOL(nobh_truncate_page); + +int block_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + sector_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, iblock, bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. 
*/ + if (!buffer_uptodate(bh)) + goto unlock; + } + + zero_user(page, offset, length); + mark_buffer_dirty(bh); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} + +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block, wbc); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. + */ + do_invalidatepage(page, 0); + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invokation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __block_write_full_page(inode, page, get_block, wbc); +} + +sector_t generic_block_bmap(struct address_space *mapping, sector_t block, + get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + tmp.b_size = 1 << inode->i_blkbits; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} + +static void end_bio_bh_io_sync(struct bio *bio, int err) +{ + struct buffer_head *bh = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + set_bit(BH_Eopnotsupp, &bh->b_state); + } + + if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + set_bit(BH_Quiet, &bh->b_state); + + bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); + bio_put(bio); +} + +int submit_bh(int rw, struct buffer_head * bh) +{ + struct bio *bio; + int ret = 0; + + BUG_ON(!buffer_locked(bh)); + BUG_ON(!buffer_mapped(bh)); + BUG_ON(!bh->b_end_io); + + /* + * Mask in barrier bit for a write (could be either a WRITE or a + * WRITE_SYNC + */ + if (buffer_ordered(bh) && (rw & WRITE)) + rw |= WRITE_BARRIER; + + /* + * Only clear out a write error when rewriting + */ + if (test_set_buffer_req(bh) && (rw & WRITE)) + clear_buffer_write_io_error(bh); + + /* + * from here on down, it's all bio -- do the initial mapping, + * submit_bio -> generic_make_request may further map this bio around + */ + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_io_vec[0].bv_page = bh->b_page; + bio->bi_io_vec[0].bv_len = bh->b_size; + bio->bi_io_vec[0].bv_offset = bh_offset(bh); + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = bh->b_size; + + bio->bi_end_io = end_bio_bh_io_sync; + bio->bi_private = bh; + + bio_get(bio); + submit_bio(rw, bio); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + + bio_put(bio); + return ret; +} + +/** + * ll_rw_block: low-level access to block devices (DEPRECATED) + * @rw: whether to %READ or %WRITE or %SWRITE or 
maybe %READA (readahead) + * @nr: number of &struct buffer_heads in the array + * @bhs: array of pointers to &struct buffer_head + * + * ll_rw_block() takes an array of pointers to &struct buffer_heads, and + * requests an I/O operation on them, either a %READ or a %WRITE. The third + * %SWRITE is like %WRITE only we make sure that the *current* data in buffers + * are sent to disk. The fourth %READA option is described in the documentation + * for generic_make_request() which ll_rw_block() calls. + * + * This function drops any buffer that it cannot get a lock on (with the + * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be + * clean when doing a write request, and any buffer that appears to be + * up-to-date when doing read request. Further it marks as clean buffers that + * are processed for writing (the buffer cache won't assume that they are + * actually clean until the buffer gets unlocked). + * + * ll_rw_block sets b_end_io to simple completion handler that marks + * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * any waiters. + * + * All of the buffers must be for the same device, and must also be a + * multiple of the current approved size for the device. + */ +void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (rw == SWRITE || rw == SWRITE_SYNC) + lock_buffer(bh); + else if (!trylock_buffer(bh)) + continue; + + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { + if (test_clear_buffer_dirty(bh)) { + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + if (rw == SWRITE_SYNC) + submit_bh(WRITE_SYNC, bh); + else + submit_bh(WRITE, bh); + continue; + } + } else { + if (!buffer_uptodate(bh)) { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(rw, bh); + continue; + } + } + unlock_buffer(bh); + } +} + +/* + * For a data-integrity writeout, we need to wait upon any in-progress I/O + * and then start new I/O and then wait upon it. The caller must have a ref on + * the buffer_head. + */ +int sync_dirty_buffer(struct buffer_head *bh) +{ + int ret = 0; + + WARN_ON(atomic_read(&bh->b_count) < 1); + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(WRITE, bh); + wait_on_buffer(bh); + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + ret = -EOPNOTSUPP; + } + if (!ret && !buffer_uptodate(bh)) + ret = -EIO; + } else { + unlock_buffer(bh); + } + return ret; +} + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and releases them if so. + * + * Exclusion against try_to_free_buffers may be obtained by either + * locking the page or by holding its mapping's private_lock. + * + * If the page is dirty but all the buffers are clean then we need to + * be sure to mark the page clean as well. This is because the page + * may be against a block device, and a later reattachment of buffers + * to a dirty page will set *all* buffers dirty. Which would corrupt + * filesystem data on the same device. + * + * The same applies to regular filesystem pages: if all the buffers are + * clean then we set the page clean and proceed. To do that, we require + * total exclusion from __set_page_dirty_buffers(). That is obtained with + * private_lock. + * + * try_to_free_buffers() is non-blocking. 
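+ *
+ * An illustrative sketch (hypothetical myfs_ name): a filesystem that
+ * keeps no private buffer state could make its ->releasepage a plain
+ * forward to this helper; the VM does the equivalent by itself when no
+ * releasepage hook is provided:
+ *
+ *	static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
+ *	{
+ *		return try_to_free_buffers(page);
+ *	}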
+ */ +static inline int buffer_busy(struct buffer_head *bh) +{ + return atomic_read(&bh->b_count) | + (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); +} + +static int +drop_buffers(struct page *page, struct buffer_head **buffers_to_free) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh; + + bh = head; + do { + if (buffer_write_io_error(bh) && page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + if (buffer_busy(bh)) + goto failed; + bh = bh->b_this_page; + } while (bh != head); + + do { + struct buffer_head *next = bh->b_this_page; + + if (bh->b_assoc_map) + __remove_assoc_queue(bh); + bh = next; + } while (bh != head); + *buffers_to_free = head; + __clear_page_buffers(page); + return 1; +failed: + return 0; +} + +int try_to_free_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + struct buffer_head *buffers_to_free = NULL; + int ret = 0; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping == NULL) { /* can this still happen? */ + ret = drop_buffers(page, &buffers_to_free); + goto out; + } + + spin_lock(&mapping->private_lock); + ret = drop_buffers(page, &buffers_to_free); + + /* + * If the filesystem writes its buffers by hand (eg ext3) + * then we can have clean buffers against a dirty page. We + * clean the page here; otherwise the VM will never notice + * that the filesystem did any IO at all. + * + * Also, during truncate, discard_buffer will have marked all + * the page's buffers clean. We discover that here and clean + * the page also. + * + * private_lock must be held over this entire operation in order + * to synchronise against __set_page_dirty_buffers and prevent the + * dirty bit from being lost. + */ +#ifndef DDE_LINUX + if (ret) + cancel_dirty_page(page, PAGE_CACHE_SIZE); +#endif + spin_unlock(&mapping->private_lock); +out: + if (buffers_to_free) { + struct buffer_head *bh = buffers_to_free; + + do { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } while (bh != buffers_to_free); + } + return ret; +} +EXPORT_SYMBOL(try_to_free_buffers); + +void block_sync_page(struct page *page) +{ + struct address_space *mapping; + + smp_mb(); + mapping = page_mapping(page); + if (mapping) + blk_run_backing_dev(mapping->backing_dev_info, page); +} + +/* + * There are no bdflush tunables left. But distributions are + * still running obsolete flush daemons, so we terminate them here. + * + * Use of bdflush() is deprecated and will be removed in a future kernel. + * The `pdflush' kernel threads fully replace bdflush daemons and this call. + */ +SYSCALL_DEFINE2(bdflush, int, func, long, data) +{ + static int msg_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the obsolete bdflush" + " system call\n", current->comm); + printk(KERN_INFO "Fix your initscripts?\n"); + } + + if (func == 1) + do_exit(0); + return 0; +} + +/* + * Buffer-head allocation + */ +static struct kmem_cache *bh_cachep; + +/* + * Once the number of bh's in the machine exceeds this level, we start + * stripping them in writeback. 
+ */ +static int max_buffer_heads; + +int buffer_heads_over_limit; + +struct bh_accounting { + int nr; /* Number of live bh's */ + int ratelimit; /* Limit cacheline bouncing */ +}; + +static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; + +static void recalc_bh_state(void) +{ + int i; + int tot = 0; + + if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) + return; + __get_cpu_var(bh_accounting).ratelimit = 0; + for_each_online_cpu(i) + tot += per_cpu(bh_accounting, i).nr; + buffer_heads_over_limit = (tot > max_buffer_heads); +} + +struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) +{ + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); + if (ret) { + INIT_LIST_HEAD(&ret->b_assoc_buffers); + get_cpu_var(bh_accounting).nr++; + recalc_bh_state(); + put_cpu_var(bh_accounting); + } + return ret; +} +EXPORT_SYMBOL(alloc_buffer_head); + +void free_buffer_head(struct buffer_head *bh) +{ + BUG_ON(!list_empty(&bh->b_assoc_buffers)); + kmem_cache_free(bh_cachep, bh); + get_cpu_var(bh_accounting).nr--; + recalc_bh_state(); + put_cpu_var(bh_accounting); +} +EXPORT_SYMBOL(free_buffer_head); + +static void buffer_exit_cpu(int cpu) +{ + int i; + struct bh_lru *b = &per_cpu(bh_lrus, cpu); + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; + per_cpu(bh_accounting, cpu).nr = 0; + put_cpu_var(bh_accounting); +} + +static int buffer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + buffer_exit_cpu((unsigned long)hcpu); + return NOTIFY_OK; +} + +/** + * bh_uptodate_or_lock - Test whether the buffer is uptodate + * @bh: struct buffer_head + * + * Return true if the buffer is up-to-date and false, + * with the buffer locked, if not. + */ +int bh_uptodate_or_lock(struct buffer_head *bh) +{ + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + if (!buffer_uptodate(bh)) + return 0; + unlock_buffer(bh); + } + return 1; +} +EXPORT_SYMBOL(bh_uptodate_or_lock); + +/** + * bh_submit_read - Submit a locked buffer for reading + * @bh: struct buffer_head + * + * Returns zero on success and -EIO on error. 
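+ *
+ * An illustrative sketch (sb and blocknr assumed to come from the
+ * caller): reading one block synchronously by pairing
+ * bh_uptodate_or_lock() with bh_submit_read():
+ *
+ *	struct buffer_head *bh = sb_getblk(sb, blocknr);
+ *
+ *	if (!bh)
+ *		return -ENOMEM;
+ *	if (!bh_uptodate_or_lock(bh)) {
+ *		if (bh_submit_read(bh)) {
+ *			brelse(bh);
+ *			return -EIO;
+ *		}
+ *	}
+ *	... use bh->b_data, then brelse(bh) when done ...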
+ */ +int bh_submit_read(struct buffer_head *bh) +{ + BUG_ON(!buffer_locked(bh)); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return 0; + return -EIO; +} +EXPORT_SYMBOL(bh_submit_read); + +static void +init_buffer_head(void *data) +{ + struct buffer_head *bh = data; + + memset(bh, 0, sizeof(*bh)); + INIT_LIST_HEAD(&bh->b_assoc_buffers); +} + +void __init buffer_init(void) +{ + int nrpages; + + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + init_buffer_head); + + /* + * Limit the bh occupancy to 10% of ZONE_NORMAL + */ + nrpages = (nr_free_buffer_pages() * 10) / 100; + max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); + hotcpu_notifier(buffer_cpu_notify, 0); +} + +EXPORT_SYMBOL(__bforget); +EXPORT_SYMBOL(__brelse); +EXPORT_SYMBOL(__wait_on_buffer); +EXPORT_SYMBOL(block_commit_write); +EXPORT_SYMBOL(block_prepare_write); +EXPORT_SYMBOL(block_page_mkwrite); +EXPORT_SYMBOL(block_read_full_page); +EXPORT_SYMBOL(block_sync_page); +EXPORT_SYMBOL(block_truncate_page); +EXPORT_SYMBOL(block_write_full_page); +EXPORT_SYMBOL(cont_write_begin); +EXPORT_SYMBOL(end_buffer_read_sync); +EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL(file_fsync); +EXPORT_SYMBOL(fsync_bdev); +EXPORT_SYMBOL(generic_block_bmap); +EXPORT_SYMBOL(generic_cont_expand_simple); +EXPORT_SYMBOL(init_buffer); +EXPORT_SYMBOL(invalidate_bdev); +EXPORT_SYMBOL(ll_rw_block); +EXPORT_SYMBOL(mark_buffer_dirty); +EXPORT_SYMBOL(submit_bh); +EXPORT_SYMBOL(sync_dirty_buffer); +EXPORT_SYMBOL(unlock_buffer); diff --git a/libdde-linux26/lib/src/fs/char_dev.c b/libdde-linux26/lib/src/fs/char_dev.c new file mode 100644 index 00000000..3b8e8b3d --- /dev/null +++ b/libdde-linux26/lib/src/fs/char_dev.c @@ -0,0 +1,572 @@ +/* + * linux/fs/char_dev.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/kdev_t.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include <linux/major.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/smp_lock.h> +#include <linux/seq_file.h> + +#include <linux/kobject.h> +#include <linux/kobj_map.h> +#include <linux/cdev.h> +#include <linux/mutex.h> +#include <linux/backing-dev.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif +#include "internal.h" + +#ifdef DDE_LINUX +#include "local.h" +#endif + +/* + * capabilities for /dev/mem, /dev/kmem and similar directly mappable character + * devices + * - permits shared-mmap for read, write and/or exec + * - does not permit private mmap in NOMMU mode (can't do COW) + * - no readahead or I/O queue unplugging required + */ +struct backing_dev_info directly_mappable_cdev_bdi = { + .capabilities = ( +#ifdef CONFIG_MMU + /* permit private copies of the data to be taken */ + BDI_CAP_MAP_COPY | +#endif + /* permit direct mmap, for read, write or exec */ + BDI_CAP_MAP_DIRECT | + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), +}; + +static struct kobj_map *cdev_map; + +static DEFINE_MUTEX(chrdevs_lock); + +static struct char_device_struct { + struct char_device_struct *next; + unsigned int major; + unsigned int baseminor; + int minorct; + char name[64]; + struct cdev *cdev; /* will die */ +} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; + +/* index in the above */ +static inline int major_to_index(int major) +{ + return major % 
CHRDEV_MAJOR_HASH_SIZE; +} + +#ifdef CONFIG_PROC_FS + +void chrdev_show(struct seq_file *f, off_t offset) +{ + struct char_device_struct *cd; + + if (offset < CHRDEV_MAJOR_HASH_SIZE) { + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[offset]; cd; cd = cd->next) + seq_printf(f, "%3d %s\n", cd->major, cd->name); + mutex_unlock(&chrdevs_lock); + } +} + +#endif /* CONFIG_PROC_FS */ + +/* + * Register a single major with a specified minor range. + * + * If major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If major > 0 this function will attempt to reserve the passed range of + * minors and will return zero on success. + * + * Returns a -ve errno on failure. + */ +static struct char_device_struct * +__register_chrdev_region(unsigned int major, unsigned int baseminor, + int minorct, const char *name) +{ + struct char_device_struct *cd, **cp; + int ret = 0; + int i; + + cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); + if (cd == NULL) + return ERR_PTR(-ENOMEM); + + mutex_lock(&chrdevs_lock); + + /* temporary */ + if (major == 0) { + for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { + if (chrdevs[i] == NULL) + break; + } + + if (i == 0) { + ret = -EBUSY; + goto out; + } + major = i; + ret = major; + } + + cd->major = major; + cd->baseminor = baseminor; + cd->minorct = minorct; + strlcpy(cd->name, name, sizeof(cd->name)); + + i = major_to_index(major); + + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major > major || + ((*cp)->major == major && + (((*cp)->baseminor >= baseminor) || + ((*cp)->baseminor + (*cp)->minorct > baseminor)))) + break; + + /* Check for overlapping minor ranges. */ + if (*cp && (*cp)->major == major) { + int old_min = (*cp)->baseminor; + int old_max = (*cp)->baseminor + (*cp)->minorct - 1; + int new_min = baseminor; + int new_max = baseminor + minorct - 1; + + /* New driver overlaps from the left. */ + if (new_max >= old_min && new_max <= old_max) { + ret = -EBUSY; + goto out; + } + + /* New driver overlaps from the right. */ + if (new_min <= old_max && new_min >= old_min) { + ret = -EBUSY; + goto out; + } + } + + cd->next = *cp; + *cp = cd; + mutex_unlock(&chrdevs_lock); + return cd; +out: + mutex_unlock(&chrdevs_lock); + kfree(cd); + return ERR_PTR(ret); +} + +static struct char_device_struct * +__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) +{ + struct char_device_struct *cd = NULL, **cp; + int i = major_to_index(major); + + mutex_lock(&chrdevs_lock); + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major == major && + (*cp)->baseminor == baseminor && + (*cp)->minorct == minorct) + break; + if (*cp) { + cd = *cp; + *cp = cd->next; + } + mutex_unlock(&chrdevs_lock); + return cd; +} + +/** + * register_chrdev_region() - register a range of device numbers + * @from: the first in the desired range of device numbers; must include + * the major number. + * @count: the number of consecutive device numbers required + * @name: the name of the device or driver. + * + * Return value is zero on success, a negative error code on failure. 
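+ *
+ * An illustrative sketch (MYDRV_MAJOR, MYDRV_NR_DEVS and "mydrv" are
+ * hypothetical example values): a driver that claims a fixed range
+ * would typically do this from its module init code:
+ *
+ *	err = register_chrdev_region(MKDEV(MYDRV_MAJOR, 0),
+ *				     MYDRV_NR_DEVS, "mydrv");
+ *	if (err)
+ *		return err;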
+ */ +int register_chrdev_region(dev_t from, unsigned count, const char *name) +{ + struct char_device_struct *cd; + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + cd = __register_chrdev_region(MAJOR(n), MINOR(n), + next - n, name); + if (IS_ERR(cd)) + goto fail; + } + return 0; +fail: + to = n; + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } + return PTR_ERR(cd); +} + +/** + * alloc_chrdev_region() - register a range of char device numbers + * @dev: output parameter for first assigned number + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: the name of the associated device or driver + * + * Allocates a range of char device numbers. The major number will be + * chosen dynamically, and returned (along with the first minor number) + * in @dev. Returns zero or a negative error code. + */ +int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, + const char *name) +{ + struct char_device_struct *cd; + cd = __register_chrdev_region(0, baseminor, count, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + *dev = MKDEV(cd->major, cd->baseminor); + return 0; +} + +/** + * register_chrdev() - Register a major number for character devices. + * @major: major device number or 0 for dynamic allocation + * @name: name of this range of devices + * @fops: file operations associated with this devices + * + * If @major == 0 this functions will dynamically allocate a major and return + * its number. + * + * If @major > 0 this function will attempt to reserve a device with the given + * major number and will return zero on success. + * + * Returns a -ve errno on failure. + * + * The name of this device has nothing to do with the name of the device in + * /dev. It only helps to keep track of the different owners of devices. If + * your module name has only one type of devices it's ok to use e.g. the name + * of the module here. + * + * This function registers a range of 256 minor numbers. The first minor number + * is 0. + */ +int register_chrdev(unsigned int major, const char *name, + const struct file_operations *fops) +{ + struct char_device_struct *cd; + struct cdev *cdev; + char *s; + int err = -ENOMEM; + + cd = __register_chrdev_region(major, 0, 256, name); + if (IS_ERR(cd)) + return PTR_ERR(cd); + + cdev = cdev_alloc(); + if (!cdev) + goto out2; + + cdev->owner = fops->owner; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, "%s", name); + for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) + *s = '!'; + + err = cdev_add(cdev, MKDEV(cd->major, 0), 256); + if (err) + goto out; + + cd->cdev = cdev; + + return major ? 0 : cd->major; +out: + kobject_put(&cdev->kobj); +out2: + kfree(__unregister_chrdev_region(cd->major, 0, 256)); + return err; +} + +/** + * unregister_chrdev_region() - return a range of device numbers + * @from: the first in the range of numbers to unregister + * @count: the number of device numbers to unregister + * + * This function will unregister a range of @count device numbers, + * starting with @from. The caller should normally be the one who + * allocated those numbers in the first place... 
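+ *
+ * An illustrative sketch (hypothetical mydrv_ names): paired with a
+ * dynamic allocation, the usual pattern is to allocate at init time and
+ * give the very same range back at exit time:
+ *
+ *	err = alloc_chrdev_region(&mydrv_devt, 0, MYDRV_NR_DEVS, "mydrv");
+ *	...
+ *	unregister_chrdev_region(mydrv_devt, MYDRV_NR_DEVS);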
+ */ +void unregister_chrdev_region(dev_t from, unsigned count) +{ + dev_t to = from + count; + dev_t n, next; + + for (n = from; n < to; n = next) { + next = MKDEV(MAJOR(n)+1, 0); + if (next > to) + next = to; + kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); + } +} + +void unregister_chrdev(unsigned int major, const char *name) +{ + struct char_device_struct *cd; + cd = __unregister_chrdev_region(major, 0, 256); + if (cd && cd->cdev) + cdev_del(cd->cdev); + kfree(cd); +} + +static DEFINE_SPINLOCK(cdev_lock); + +static struct kobject *cdev_get(struct cdev *p) +{ + struct module *owner = p->owner; + struct kobject *kobj; + + if (owner && !try_module_get(owner)) + return NULL; + kobj = kobject_get(&p->kobj); + if (!kobj) + module_put(owner); + return kobj; +} + +void cdev_put(struct cdev *p) +{ + if (p) { + struct module *owner = p->owner; + kobject_put(&p->kobj); + module_put(owner); + } +} + +/* + * Called every time a character special file is opened + */ +static int chrdev_open(struct inode *inode, struct file *filp) +{ + struct cdev *p; + struct cdev *new = NULL; + int ret = 0; + + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { + struct kobject *kobj; + int idx; + spin_unlock(&cdev_lock); + kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); + if (!kobj) + return -ENXIO; + new = container_of(kobj, struct cdev, kobj); + spin_lock(&cdev_lock); + /* Check i_cdev again in case somebody beat us to it while + we dropped the lock. */ + p = inode->i_cdev; + if (!p) { + inode->i_cdev = p = new; + inode->i_cindex = idx; + list_add(&inode->i_devices, &p->list); + new = NULL; + } else if (!cdev_get(p)) + ret = -ENXIO; + } else if (!cdev_get(p)) + ret = -ENXIO; + spin_unlock(&cdev_lock); + cdev_put(new); + if (ret) + return ret; + + ret = -ENXIO; + filp->f_op = fops_get(p->ops); + if (!filp->f_op) + goto out_cdev_put; + + if (filp->f_op->open) { + ret = filp->f_op->open(inode,filp); + if (ret) + goto out_cdev_put; + } + + return 0; + + out_cdev_put: + cdev_put(p); + return ret; +} + +void cd_forget(struct inode *inode) +{ + spin_lock(&cdev_lock); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + spin_unlock(&cdev_lock); +} + +static void cdev_purge(struct cdev *cdev) +{ + spin_lock(&cdev_lock); + while (!list_empty(&cdev->list)) { + struct inode *inode; + inode = container_of(cdev->list.next, struct inode, i_devices); + list_del_init(&inode->i_devices); + inode->i_cdev = NULL; + } + spin_unlock(&cdev_lock); +} + +/* + * Dummy default file-operations: the only thing this does + * is contain the open that then fills in the correct operations + * depending on the special file... + */ +const struct file_operations def_chr_fops = { + .open = chrdev_open, +}; + +static struct kobject *exact_match(dev_t dev, int *part, void *data) +{ + struct cdev *p = data; + return &p->kobj; +} + +static int exact_lock(dev_t dev, void *data) +{ + struct cdev *p = data; + return cdev_get(p) ? 0 : -1; +} + +/** + * cdev_add() - add a char device to the system + * @p: the cdev structure for the device + * @dev: the first device number for which this device is responsible + * @count: the number of consecutive minor numbers corresponding to this + * device + * + * cdev_add() adds the device represented by @p to the system, making it + * live immediately. A negative error code is returned on failure. 
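+ *
+ * An illustrative sketch (hypothetical mydrv_ names): after the device
+ * numbers have been obtained with alloc_chrdev_region() or
+ * register_chrdev_region(), the usual sequence is
+ *
+ *	cdev_init(&mydrv_cdev, &mydrv_fops);
+ *	mydrv_cdev.owner = THIS_MODULE;
+ *	err = cdev_add(&mydrv_cdev, mydrv_devt, MYDRV_NR_DEVS);
+ *	if (err)
+ *		goto out_unregister;
+ *
+ * with cdev_del() and unregister_chrdev_region() as the matching
+ * teardown.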
+ */ +int cdev_add(struct cdev *p, dev_t dev, unsigned count) +{ + p->dev = dev; + p->count = count; + return kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); +} + +static void cdev_unmap(dev_t dev, unsigned count) +{ + kobj_unmap(cdev_map, dev, count); +} + +/** + * cdev_del() - remove a cdev from the system + * @p: the cdev structure to be removed + * + * cdev_del() removes @p from the system, possibly freeing the structure + * itself. + */ +void cdev_del(struct cdev *p) +{ + cdev_unmap(p->dev, p->count); + kobject_put(&p->kobj); +} + + +static void cdev_default_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + cdev_purge(p); +} + +static void cdev_dynamic_release(struct kobject *kobj) +{ + struct cdev *p = container_of(kobj, struct cdev, kobj); + cdev_purge(p); + kfree(p); +} + +static struct kobj_type ktype_cdev_default = { + .release = cdev_default_release, +}; + +static struct kobj_type ktype_cdev_dynamic = { + .release = cdev_dynamic_release, +}; + +/** + * cdev_alloc() - allocate a cdev structure + * + * Allocates and returns a cdev structure, or NULL on failure. + */ +struct cdev *cdev_alloc(void) +{ + struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); + if (p) { + INIT_LIST_HEAD(&p->list); + kobject_init(&p->kobj, &ktype_cdev_dynamic); + } + return p; +} + +/** + * cdev_init() - initialize a cdev structure + * @cdev: the structure to initialize + * @fops: the file_operations for this device + * + * Initializes @cdev, remembering @fops, making it ready to add to the + * system with cdev_add(). + */ +void cdev_init(struct cdev *cdev, const struct file_operations *fops) +{ + memset(cdev, 0, sizeof *cdev); + INIT_LIST_HEAD(&cdev->list); + kobject_init(&cdev->kobj, &ktype_cdev_default); + cdev->ops = fops; +} + +static struct kobject *base_probe(dev_t dev, int *part, void *data) +{ + if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("char-major-%d", MAJOR(dev)); + return NULL; +} + +void __init chrdev_init(void) +{ + cdev_map = kobj_map_init(base_probe, &chrdevs_lock); + bdi_init(&directly_mappable_cdev_bdi); +} + +#ifndef LIBINPUT +core_initcall(chrdev_init); +#endif + +/* Let modules do char dev stuff */ +EXPORT_SYMBOL(register_chrdev_region); +EXPORT_SYMBOL(unregister_chrdev_region); +EXPORT_SYMBOL(alloc_chrdev_region); +EXPORT_SYMBOL(cdev_init); +EXPORT_SYMBOL(cdev_alloc); +EXPORT_SYMBOL(cdev_del); +EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(register_chrdev); +EXPORT_SYMBOL(unregister_chrdev); +EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/libdde-linux26/lib/src/init/calibrate.c b/libdde-linux26/lib/src/init/calibrate.c new file mode 100644 index 00000000..36747582 --- /dev/null +++ b/libdde-linux26/lib/src/init/calibrate.c @@ -0,0 +1,204 @@ +/* calibrate.c: default delay calibration + * + * Excised from init/main.c + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/jiffies.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/timex.h> +#include <linux/smp.h> +#include <ddekit/timer.h> + +unsigned long loops_per_jiffy = (1<<12); + +unsigned long lpj_fine; +unsigned long preset_lpj; +static int __init lpj_setup(char *str) +{ + preset_lpj = simple_strtoul(str,NULL,0); + return 1; +} + +__setup("lpj=", lpj_setup); + +#ifdef ARCH_HAS_READ_CURRENT_TIMER + +/* This routine uses the read_current_timer() routine and gets the + * loops per jiffy directly, instead of guessing it using delay(). 
+ * Also, this code tries to handle non-maskable asynchronous events + * (like SMIs) + */ +#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100)) +#define MAX_DIRECT_CALIBRATION_RETRIES 5 + +static unsigned long __cpuinit calibrate_delay_direct(void) +{ + unsigned long pre_start, start, post_start; + unsigned long pre_end, end, post_end; + unsigned long start_jiffies; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; + int i; + + /* TODO It's not a very good place to call this function + * as TSC is very platform-dependant but calibrate_delay_direct + * isn't that much. */ + use_tsc_delay (); + + if (read_current_timer(&pre_start) < 0 ) + return 0; + + /* + * A simple loop like + * while ( jiffies < start_jiffies+1) + * start = read_current_timer(); + * will not do. As we don't really know whether jiffy switch + * happened first or timer_value was read first. And some asynchronous + * event can happen between these two events introducing errors in lpj. + * + * So, we do + * 1. pre_start <- When we are sure that jiffy switch hasn't happened + * 2. check jiffy switch + * 3. start <- timer value before or after jiffy switch + * 4. post_start <- When we are sure that jiffy switch has happened + * + * Note, we don't know anything about order of 2 and 3. + * Now, by looking at post_start and pre_start difference, we can + * check whether any asynchronous event happened or not + */ + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + pre_start = 0; + read_current_timer(&start); + start_jiffies = jiffies; + while (jiffies <= (start_jiffies + 1)) { + pre_start = start; + read_current_timer(&start); + } + read_current_timer(&post_start); + + pre_end = 0; + end = post_start; + while (jiffies <= + (start_jiffies + 1 + DELAY_CALIBRATION_TICKS)) { + pre_end = end; + read_current_timer(&end); + } + read_current_timer(&post_end); + + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; + + /* + * If the upper limit and lower limit of the timer_rate is + * >= 12.5% apart, redo calibration. + */ + if (pre_start != 0 && pre_end != 0 && + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; + } + } + + if (good_timer_count) + return (good_timer_sum/good_timer_count); + + printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " + "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n"); + return 0; +} +#else +static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} +#endif + +/* + * This is the number of bits of precision for the loops_per_jiffy. Each + * bit takes on average 1.5/HZ seconds. This (like the original) is a little + * better than 1% + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as + * the cpu frequency, hence do the calibration for those. + */ +#define LPS_PREC 8 + +static unsigned long __cpuinit calibrate_delay_estimate(void) +{ + unsigned long ticks, loopbit; + int lps_precision = LPS_PREC; + int loops_per_jiffy; + + loops_per_jiffy = (1<<12); + + while ((loops_per_jiffy <<= 1) != 0) { + /* wait for "start of" clock tick */ + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + /* Go .. 
*/ + ticks = jiffies; + __delay(loops_per_jiffy); + ticks = jiffies - ticks; + if (ticks) + break; + } + + /* + * Do a binary approximation to get loops_per_jiffy set to + * equal one clock (up to lps_precision bits) + */ + loops_per_jiffy >>= 1; + loopbit = loops_per_jiffy; + while (lps_precision-- && (loopbit >>= 1)) { + loops_per_jiffy |= loopbit; + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + ticks = jiffies; + __delay(loops_per_jiffy); + if (jiffies != ticks) /* longer than 1 tick */ + loops_per_jiffy &= ~loopbit; + } + return loops_per_jiffy; +} + +#define NRETRIES 3 + +void __cpuinit calibrate_delay(void) +{ + if (preset_lpj) { + loops_per_jiffy = preset_lpj; + printk(KERN_INFO + "Calibrating delay loop (skipped) preset value.. "); + } else if ((smp_processor_id() == 0) && lpj_fine) { + loops_per_jiffy = lpj_fine; + printk(KERN_INFO + "Calibrating delay loop (skipped), " + "value calculated using timer frequency.. "); + } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { + printk(KERN_INFO + "Calibrating delay using timer specific routine.. "); + } else { + int i; + unsigned long result; + + printk(KERN_INFO "Calibrating delay loop... "); + loops_per_jiffy = 0; + for (i = 0; i < NRETRIES; i++) { + result = calibrate_delay_estimate(); + if (result > loops_per_jiffy) + loops_per_jiffy = result; + } + + } + printk(KERN_CONT "%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); +} + +core_initcall(calibrate_delay); diff --git a/libdde-linux26/lib/src/kernel/capability.c b/libdde-linux26/lib/src/kernel/capability.c new file mode 100644 index 00000000..c269aa7c --- /dev/null +++ b/libdde-linux26/lib/src/kernel/capability.c @@ -0,0 +1,323 @@ +/* + * linux/kernel/capability.c + * + * Copyright (C) 1997 Andrew Main <zefram@fysh.org> + * + * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> + * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> + */ + +#include <linux/audit.h> +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/security.h> +#include <linux/syscalls.h> +#include <linux/pid_namespace.h> +#include <asm/uaccess.h> +#include "cred-internals.h" + +#ifndef DDE_LINUX +/* + * This lock protects task->cap_* for all tasks including current. + * Locking rule: acquire this prior to tasklist_lock. 
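+ *
+ * That is, a path needing both locks would nest them as
+ *
+ *	spin_lock(&task_capability_lock);
+ *	read_lock(&tasklist_lock);
+ *	...
+ *	read_unlock(&tasklist_lock);
+ *	spin_unlock(&task_capability_lock);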
+ */ +static DEFINE_SPINLOCK(task_capability_lock); + +/* + * Leveraged for setting/resetting capabilities + */ + +const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; +const kernel_cap_t __cap_full_set = CAP_FULL_SET; +const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; + +EXPORT_SYMBOL(__cap_empty_set); +EXPORT_SYMBOL(__cap_full_set); +EXPORT_SYMBOL(__cap_init_eff_set); + +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES +int file_caps_enabled = 1; + +static int __init file_caps_disable(char *str) +{ + file_caps_enabled = 0; + return 1; +} +__setup("no_file_caps", file_caps_disable); +#endif + +/* + * More recent versions of libcap are available from: + * + * http://www.kernel.org/pub/linux/libs/security/linux-privs/ + */ + +static void warn_legacy_capability_use(void) +{ + static int warned; + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" + " (legacy support in use)\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. + */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + +/* + * The only thing that can change the capabilities of the current + * process is the current process. As such, we can't be in this code + * at the same time as we are in the process of setting capabilities + * in this process. The net result is that we can limit our use of + * locks to when we are reading the caps of another process. 
+ */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + int ret; + + if (pid && (pid != task_pid_vnr(current))) { + struct task_struct *target; + + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else + ret = security_capget(target, pEp, pIp, pPp); + + read_unlock(&tasklist_lock); + } else + ret = security_capget(current, pEp, pIp, pPp); + + return ret; +} + +/** + * sys_capget - get the capabilities of a given process. + * @header: pointer to struct that contains capability version and + * target pid data + * @dataptr: pointer to struct that contains the effective, permitted, + * and inheritable capabilities that are returned + * + * Returns 0 on success and < 0 on error. + */ +SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) +{ + int ret = 0; + pid_t pid; + unsigned tocopy; + kernel_cap_t pE, pI, pP; + + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; + + if (get_user(pid, &header->pid)) + return -EFAULT; + + if (pid < 0) + return -EINVAL; + + ret = cap_get_target_pid(pid, &pE, &pI, &pP); + if (!ret) { + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; + unsigned i; + + for (i = 0; i < tocopy; i++) { + kdata[i].effective = pE.cap[i]; + kdata[i].permitted = pP.cap[i]; + kdata[i].inheritable = pI.cap[i]; + } + + /* + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, + * we silently drop the upper capabilities here. This + * has the effect of making older libcap + * implementations implicitly drop upper capability + * bits when they perform a: capget/modify/capset + * sequence. + * + * This behavior is considered fail-safe + * behavior. Upgrading the application to a newer + * version of libcap will enable access to the newer + * capabilities. + * + * An alternative would be to return an error here + * (-ERANGE), but that causes legacy applications to + * unexpectidly fail; the capget/modify/capset aborts + * before modification is attempted and the application + * fails. + */ + if (copy_to_user(dataptr, kdata, tocopy + * sizeof(struct __user_cap_data_struct))) { + return -EFAULT; + } + } + + return ret; +} + +/** + * sys_capset - set capabilities for a process or (*) a group of processes + * @header: pointer to struct that contains capability version and + * target pid data + * @data: pointer to struct that contains the effective, permitted, + * and inheritable capabilities + * + * Set capabilities for the current process only. The ability to any other + * process(es) has been deprecated and removed. + * + * The restrictions on setting capabilities are specified as: + * + * I: any raised capabilities must be a subset of the old permitted + * P: any raised capabilities must be a subset of the old permitted + * E: must be set to a subset of new permitted + * + * Returns 0 on success and < 0 on error. 
+ */ +SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) +{ + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; + unsigned i, tocopy; + kernel_cap_t inheritable, permitted, effective; + struct cred *new; + int ret; + pid_t pid; + + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; + + if (get_user(pid, &header->pid)) + return -EFAULT; + + /* may only affect current now */ + if (pid != 0 && pid != task_pid_vnr(current)) + return -EPERM; + + if (copy_from_user(&kdata, data, + tocopy * sizeof(struct __user_cap_data_struct))) + return -EFAULT; + + for (i = 0; i < tocopy; i++) { + effective.cap[i] = kdata[i].effective; + permitted.cap[i] = kdata[i].permitted; + inheritable.cap[i] = kdata[i].inheritable; + } + while (i < _KERNEL_CAPABILITY_U32S) { + effective.cap[i] = 0; + permitted.cap[i] = 0; + inheritable.cap[i] = 0; + i++; + } + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = security_capset(new, current_cred(), + &effective, &inheritable, &permitted); + if (ret < 0) + goto error; + + audit_log_capset(pid, new, current_cred()); + + return commit_creds(new); + +error: + abort_creds(new); + return ret; +} +#endif /* !DDE_LINUX */ + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +int capable(int cap) +{ + if (unlikely(!cap_valid(cap))) { + printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + BUG(); + } + + if (security_capable(cap) == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} +EXPORT_SYMBOL(capable); diff --git a/libdde-linux26/lib/src/kernel/cred-internals.h b/libdde-linux26/lib/src/kernel/cred-internals.h new file mode 100644 index 00000000..2dc4fc2d --- /dev/null +++ b/libdde-linux26/lib/src/kernel/cred-internals.h @@ -0,0 +1,21 @@ +/* Internal credentials stuff + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +/* + * user.c + */ +static inline void sched_switch_user(struct task_struct *p) +{ +#ifdef CONFIG_USER_SCHED + sched_move_task(p); +#endif /* CONFIG_USER_SCHED */ +} + diff --git a/libdde-linux26/lib/src/kernel/exit.c b/libdde-linux26/lib/src/kernel/exit.c new file mode 100644 index 00000000..703f9aab --- /dev/null +++ b/libdde-linux26/lib/src/kernel/exit.c @@ -0,0 +1,1850 @@ +/* + * linux/kernel/exit.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/personality.h> +#include <linux/tty.h> +#include <linux/mnt_namespace.h> +#include <linux/iocontext.h> +#include <linux/key.h> +#include <linux/security.h> +#include <linux/cpu.h> +#include <linux/acct.h> +#include <linux/tsacct_kern.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/binfmts.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> +#include <linux/ptrace.h> +#include <linux/profile.h> +#include <linux/mount.h> +#include <linux/proc_fs.h> +#include <linux/kthread.h> +#include <linux/mempolicy.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> +#include <linux/freezer.h> +#include <linux/cgroup.h> +#include <linux/syscalls.h> +#include <linux/signal.h> +#include <linux/posix-timers.h> +#include <linux/cn_proc.h> +#include <linux/mutex.h> +#include <linux/futex.h> +#include <linux/pipe_fs_i.h> +#include <linux/audit.h> /* for audit_free() */ +#include <linux/resource.h> +#include <linux/blkdev.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/tracehook.h> +#include <linux/init_task.h> +#include <trace/sched.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/pgtable.h> +#include <asm/mmu_context.h> +#include "cred-internals.h" + +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); + +#ifndef DDE_LINUX +static void exit_mm(struct task_struct * tsk); + +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + +static void __unhash_process(struct task_struct *p) +{ + nr_threads--; + detach_pid(p, PIDTYPE_PID); + if (thread_group_leader(p)) { + detach_pid(p, PIDTYPE_PGID); + detach_pid(p, PIDTYPE_SID); + + list_del_rcu(&p->tasks); + __get_cpu_var(process_counts)--; + } + list_del_rcu(&p->thread_group); + list_del_init(&p->sibling); +} + +/* + * This function expects the tasklist_lock write-locked. + */ +static void __exit_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *sighand; + + BUG_ON(!sig); + BUG_ON(!atomic_read(&sig->count)); + + sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); + + posix_cpu_timers_exit(tsk); + if (atomic_dec_and_test(&sig->count)) + posix_cpu_timers_exit_group(tsk); + else { + /* + * If there is any task waiting for the group exit + * then notify it: + */ + if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) + wake_up_process(sig->group_exit_task); + + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + /* + * Accumulate here the counters for all threads but the + * group leader as they die, so they can be added into + * the process-wide totals when those are taken. + * The group leader stays around as a zombie as long + * as there are other threads. When it gets reaped, + * the exit.c code will add its counts into these totals. 
+ * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. + */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + sig = NULL; /* Marker for below. */ + } + + __unhash_process(tsk); + + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + + tsk->signal = NULL; + tsk->sighand = NULL; + spin_unlock(&sighand->siglock); + + __cleanup_sighand(sighand); + clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + if (sig) { + flush_sigqueue(&sig->shared_pending); + taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); + __cleanup_signal(sig); + } +} + +static void delayed_put_task_struct(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + trace_sched_process_free(tsk); + put_task_struct(tsk); +} + + +void release_task(struct task_struct * p) +{ + struct task_struct *leader; + int zap_leader; +repeat: + tracehook_prepare_release_task(p); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ + atomic_dec(&__task_cred(p)->user->processes); + + proc_flush_task(p); + write_lock_irq(&tasklist_lock); + tracehook_finish_release_task(p); + __exit_signal(p); + + /* + * If we are the last non-leader member of the thread + * group, and the leader is zombie, then notify the + * group leader's parent process. (if it wants notification.) + */ + zap_leader = 0; + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + BUG_ON(task_detached(leader)); + do_notify_parent(leader, leader->exit_signal); + /* + * If we were the last child thread and the leader has + * exited already, and the leader's parent ignores SIGCHLD, + * then we are the one who should release the leader. + * + * do_notify_parent() will have marked it self-reaping in + * that case. + */ + zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; + } + + write_unlock_irq(&tasklist_lock); + release_thread(p); + call_rcu(&p->rcu, delayed_put_task_struct); + + p = leader; + if (unlikely(zap_leader)) + goto repeat; +} + +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + * + * The caller must hold rcu lock or the tasklist lock. + */ +struct pid *session_of_pgrp(struct pid *pgrp) +{ + struct task_struct *p; + struct pid *sid = NULL; + + p = pid_task(pgrp, PIDTYPE_PGID); + if (p == NULL) + p = pid_task(pgrp, PIDTYPE_PID); + if (p != NULL) + sid = task_session(p); + + return sid; +} + +/* + * Determine if a process group is "orphaned", according to the POSIX + * definition in 2.2.2.52. 
Orphaned process groups are not to be affected
+ * by terminal-generated stop signals.  Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
+{
+	struct task_struct *p;
+
+	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+		if ((p == ignored_task) ||
+		    (p->exit_state && thread_group_empty(p)) ||
+		    is_global_init(p->real_parent))
+			continue;
+
+		if (task_pgrp(p->real_parent) != pgrp &&
+		    task_session(p->real_parent) == task_session(p))
+			return 0;
+	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+
+	return 1;
+}
+
+int is_current_pgrp_orphaned(void)
+{
+	int retval;
+
+	read_lock(&tasklist_lock);
+	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
+	read_unlock(&tasklist_lock);
+
+	return retval;
+}
+
+static int has_stopped_jobs(struct pid *pgrp)
+{
+	int retval = 0;
+	struct task_struct *p;
+
+	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+		if (!task_is_stopped(p))
+			continue;
+		retval = 1;
+		break;
+	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+	return retval;
+}
+
+/*
+ * Check to see if any process groups have become orphaned as
+ * a result of our exiting, and if they have any stopped jobs,
+ * send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+ */
+static void
+kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
+{
+	struct pid *pgrp = task_pgrp(tsk);
+	struct task_struct *ignored_task = tsk;
+
+	if (!parent)
+		/* exit: our father is in a different pgrp than
+		 * we are and we were the only connection outside.
+		 */
+		parent = tsk->real_parent;
+	else
+		/* reparent: our child is in a different pgrp than
+		 * we are, and it was the only connection outside.
+		 */
+		ignored_task = NULL;
+
+	if (task_pgrp(parent) != pgrp &&
+	    task_session(parent) == task_session(tsk) &&
+	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
+	    has_stopped_jobs(pgrp)) {
+		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
+	}
+}
+
+/**
+ * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to kthreadd so it
+ * isn't in the way of other processes and is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
+ */
+static void reparent_to_kthreadd(void)
+{
+	write_lock_irq(&tasklist_lock);
+
+	ptrace_unlink(current);
+	/* Reparent to init */
+	current->real_parent = current->parent = kthreadd_task;
+	list_move_tail(&current->sibling, &current->real_parent->children);
+
+	/* Set the exit signal to SIGCHLD so we signal init on exit */
+	current->exit_signal = SIGCHLD;
+
+	if (task_nice(current) < 0)
+		set_user_nice(current, 0);
+	/* cpus_allowed? */
+	/* rt_priority? */
+	/* signals?
*/
+	memcpy(current->signal->rlim, init_task.signal->rlim,
+	       sizeof(current->signal->rlim));
+
+#ifndef DDE_LINUX
+	atomic_inc(&init_cred.usage);
+	commit_creds(&init_cred);
+#endif
+	write_unlock_irq(&tasklist_lock);
+}
+
+void __set_special_pids(struct pid *pid)
+{
+	struct task_struct *curr = current->group_leader;
+	pid_t nr = pid_nr(pid);
+
+	if (task_session(curr) != pid) {
+		change_pid(curr, PIDTYPE_SID, pid);
+		set_task_session(curr, nr);
+	}
+	if (task_pgrp(curr) != pid) {
+		change_pid(curr, PIDTYPE_PGID, pid);
+		set_task_pgrp(curr, nr);
+	}
+}
+
+static void set_special_pids(struct pid *pid)
+{
+	write_lock_irq(&tasklist_lock);
+	__set_special_pids(pid);
+	write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Let kernel threads use this to say that they
+ * allow a certain signal (since daemonize() will
+ * have disabled all of them by default).
+ */
+int allow_signal(int sig)
+{
+	if (!valid_signal(sig) || sig < 1)
+		return -EINVAL;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigdelset(&current->blocked, sig);
+	if (!current->mm) {
+		/* Kernel threads handle their own signals.
+		   Let the signal code know it'll be handled, so
+		   that they don't get converted to SIGKILL or
+		   just silently dropped */
+		current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
+	}
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	return 0;
+}
+
+EXPORT_SYMBOL(allow_signal);
+
+int disallow_signal(int sig)
+{
+	if (!valid_signal(sig) || sig < 1)
+		return -EINVAL;
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	return 0;
+}
+
+EXPORT_SYMBOL(disallow_signal);
+
+/*
+ * Put all the gunge required to become a kernel thread without
+ * attached user resources in one place where it belongs.
+ */
+
+void daemonize(const char *name, ...)
+{
+	va_list args;
+	struct fs_struct *fs;
+	sigset_t blocked;
+
+	va_start(args, name);
+	vsnprintf(current->comm, sizeof(current->comm), name, args);
+	va_end(args);
+
+	/*
+	 * If we were started as result of loading a module, close all of the
+	 * user space pages.  We don't need them, and if we didn't close them
+	 * they would be locked into memory.
+	 */
+	exit_mm(current);
+	/*
+	 * We don't want to have TIF_FREEZE set if the system-wide hibernation
+	 * or suspend transition begins right now.
+	 */
+	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
+
+	if (current->nsproxy != &init_nsproxy) {
+		get_nsproxy(&init_nsproxy);
+		switch_task_namespaces(current, &init_nsproxy);
+	}
+	set_special_pids(&init_struct_pid);
+	proc_clear_tty(current);
+
+	/* Block and flush all signals */
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(current);
+
+	/* Become as one with the init task */
+
+	exit_fs(current);	/* current->fs->count--; */
+	fs = init_task.fs;
+	current->fs = fs;
+	atomic_inc(&fs->count);
+
+	exit_files(current);
+	current->files = init_task.files;
+	atomic_inc(&current->files->count);
+
+	reparent_to_kthreadd();
+}
+
+EXPORT_SYMBOL(daemonize);
+
+static void close_files(struct files_struct * files)
+{
+	int i, j;
+	struct fdtable *fdt;
+
+	j = 0;
+
+	/*
+	 * It is safe to dereference the fd table without RCU or
+	 * ->file_lock because this is the last reference to the
+	 * files structure.
+ */ + fdt = files_fdtable(files); + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= fdt->max_fds) + break; + set = fdt->open_fds->fds_bits[j++]; + while (set) { + if (set & 1) { + struct file * file = xchg(&fdt->fd[i], NULL); + if (file) { + filp_close(file, files); + cond_resched(); + } + } + i++; + set >>= 1; + } + } +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ + struct files_struct *files; + + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + + return files; +} + +void put_files_struct(struct files_struct *files) +{ + struct fdtable *fdt; + + if (atomic_dec_and_test(&files->count)) { + close_files(files); + /* + * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. + */ + fdt = files_fdtable(files); + if (fdt != &files->fdtab) + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); + } +} + +void reset_files_struct(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} + +void exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); + put_files_struct(files); + } +} + +void put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + put_fs_struct(fs); + } +} + +EXPORT_SYMBOL_GPL(exit_fs); + +#ifdef CONFIG_MM_OWNER +/* + * Task p is exiting and it owned mm, lets find a new owner for it + */ +static inline int +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) +{ + /* + * If there are other users of the mm and the owner (us) is exiting + * we need to find a new owner to take on the responsibility. + */ + if (atomic_read(&mm->mm_users) <= 1) + return 0; + if (mm->owner != p) + return 0; + return 1; +} + +void mm_update_next_owner(struct mm_struct *mm) +{ + struct task_struct *c, *g, *p = current; + +retry: + if (!mm_need_new_owner(mm, p)) + return; + + read_lock(&tasklist_lock); + /* + * Search in the children + */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search in the siblings + */ + list_for_each_entry(c, &p->parent->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search through everything else. We should not get + * here often + */ + do_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + } while_each_thread(g, c); + + read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL. + */ + mm->owner = NULL; + return; + +assign_new_owner: + BUG_ON(c == p); + get_task_struct(c); + /* + * The task_lock protects c->mm from changing. 
+ * We always want mm->owner->mm == mm + */ + task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); + if (c->mm != mm) { + task_unlock(c); + put_task_struct(c); + goto retry; + } + mm->owner = c; + task_unlock(c); + put_task_struct(c); +} +#endif /* CONFIG_MM_OWNER */ + +/* + * Turn us into a lazy TLB process if we + * aren't already.. + */ +static void exit_mm(struct task_struct * tsk) +{ + struct mm_struct *mm = tsk->mm; + struct core_state *core_state; + + mm_release(tsk, mm); + if (!mm) + return; + /* + * Serialize with any possible pending coredump. + * We must hold mmap_sem around checking core_state + * and clearing tsk->mm. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group with ->mm != NULL. + */ + down_read(&mm->mmap_sem); + core_state = mm->core_state; + if (core_state) { + struct core_thread self; + up_read(&mm->mmap_sem); + + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); + down_read(&mm->mmap_sem); + } + atomic_inc(&mm->mm_count); + BUG_ON(mm != tsk->active_mm); + /* more a memory barrier than a real lock */ + task_lock(tsk); + tsk->mm = NULL; + up_read(&mm->mmap_sem); + enter_lazy_tlb(mm, current); + /* We don't want this task to be frozen prematurely */ + clear_freeze_flag(tsk); + task_unlock(tsk); + mm_update_next_owner(mm); + mmput(mm); +} + +/* + * Return nonzero if @parent's children should reap themselves. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static int ignoring_children(struct task_struct *parent) +{ + int ret; + struct sighand_struct *psig = parent->sighand; + unsigned long flags; + spin_lock_irqsave(&psig->siglock, flags); + ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); + spin_unlock_irqrestore(&psig->siglock, flags); + return ret; +} + +/* + * Detach all tasks we were using ptrace on. + * Any that need to be release_task'd are put on the @dead list. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_exit(struct task_struct *parent, struct list_head *dead) +{ + struct task_struct *p, *n; + int ign = -1; + + list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { + __ptrace_unlink(p); + + if (p->exit_state != EXIT_ZOMBIE) + continue; + + /* + * If it's a zombie, our attachedness prevented normal + * parent notification or self-reaping. Do notification + * now if it would have happened earlier. If it should + * reap itself, add it to the @dead list. We can't call + * release_task() here because we already hold tasklist_lock. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, parent)) + do_notify_parent(p, p->exit_signal); + else { + if (ign < 0) + ign = ignoring_children(parent); + if (ign) + p->exit_signal = -1; + } + } + + if (task_detached(p)) { + /* + * Mark it as in the process of being reaped. 
+ */ + p->exit_state = EXIT_DEAD; + list_add(&p->ptrace_entry, dead); + } + } +} + +/* + * Finish up exit-time ptrace cleanup. + * + * Called without locks. + */ +static void ptrace_exit_finish(struct task_struct *parent, + struct list_head *dead) +{ + struct task_struct *p, *n; + + BUG_ON(!list_empty(&parent->ptraced)); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + +static void reparent_thread(struct task_struct *p, struct task_struct *father) +{ + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); + + /* If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (same_thread_group(p->real_parent, father)) + return; + + /* We don't want people slaying init. */ + if (!task_detached(p)) + p->exit_signal = SIGCHLD; + + /* If we'd notified the old parent about this child's death, + * also notify the new parent. + */ + if (!ptrace_reparented(p) && + p->exit_state == EXIT_ZOMBIE && + !task_detached(p) && thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + + kill_orphaned_pgrp(p, father); +} + +/* + * When we die, we re-parent all our children. + * Try to give them to another thread in our thread + * group, and if no such member exists, give it to + * the child reaper process (ie "init") in our pid + * space. + */ +static struct task_struct *find_new_reaper(struct task_struct *father) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + +static void forget_original_parent(struct task_struct *father) +{ + struct task_struct *p, *n, *reaper; + LIST_HEAD(ptrace_dead); + + write_lock_irq(&tasklist_lock); + reaper = find_new_reaper(father); + /* + * First clean up ptrace if we were using it. + */ + ptrace_exit(father, &ptrace_dead); + + list_for_each_entry_safe(p, n, &father->children, sibling) { + p->real_parent = reaper; + if (p->parent == father) { + BUG_ON(p->ptrace); + p->parent = p->real_parent; + } + reparent_thread(p, father); + } + + write_unlock_irq(&tasklist_lock); + BUG_ON(!list_empty(&father->children)); + + ptrace_exit_finish(father, &ptrace_dead); +} + +/* + * Send signals to all our closest relatives so that they know + * to properly mourn us.. + */ +static void exit_notify(struct task_struct *tsk, int group_dead) +{ + int signal; + void *cookie; + + /* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) + */ + forget_original_parent(tsk); + exit_task_namespaces(tsk); + + write_lock_irq(&tasklist_lock); + if (group_dead) + kill_orphaned_pgrp(tsk->group_leader, NULL); + + /* Let father know we died + * + * Thread signals are configurable, but you aren't going to use + * that to send signals to arbitary processes. + * That stops right now. + * + * If the parent exec id doesn't match the exec id we saved + * when we started then we know the parent has changed security + * domain. + * + * If our self_exec id doesn't match our parent_exec_id then + * we have changed execution domain as these two values started + * the same after a fork. + */ + if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && + (tsk->parent_exec_id != tsk->real_parent->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && + !capable(CAP_KILL)) + tsk->exit_signal = SIGCHLD; + + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal >= 0) + signal = do_notify_parent(tsk, signal); + + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + + /* mt-exec, de_thread() is waiting for us */ + if (thread_group_leader(tsk) && + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) + wake_up_process(tsk->signal->group_exit_task); + + write_unlock_irq(&tasklist_lock); + + tracehook_report_death(tsk, signal, cookie, group_dead); + + /* If the process is dead, release it - nobody will wait for it */ + if (signal == DEATH_REAP) + release_task(tsk); +} + +#ifdef CONFIG_DEBUG_STACK_USAGE +static void check_stack_usage(void) +{ + static DEFINE_SPINLOCK(low_water_lock); + static int lowest_to_date = THREAD_SIZE; + unsigned long *n = end_of_stack(current); + unsigned long free; + + while (*n == 0) + n++; + free = (unsigned long)n - (unsigned long)end_of_stack(current); + + if (free >= lowest_to_date) + return; + + spin_lock(&low_water_lock); + if (free < lowest_to_date) { + printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " + "left\n", + current->comm, free); + lowest_to_date = free; + } + spin_unlock(&low_water_lock); +} +#else +static inline void check_stack_usage(void) {} +#endif + +NORET_TYPE void do_exit(long code) +{ + struct task_struct *tsk = current; + int group_dead; + + profile_task_exit(tsk); + + WARN_ON(atomic_read(&tsk->fs_excl)); + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + tracehook_report_exit(&code); + + /* + * We're taking recursive faults here in do_exit. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + printk(KERN_ALERT + "Fixing recursive fault but reboot is needed!\n"); + /* + * We can do this unlocked here. The futex code uses + * this flag just to verify whether the pi state + * cleanup has been done or not. In the worst case it + * loops once more. We pretend that the cleanup was + * done as there is no way to return. Either the + * OWNER_DIED bit is set by now or we push the blocked + * task into the wait for ever nirwana as well. + */ + tsk->flags |= PF_EXITPIDONE; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } + + exit_signals(tsk); /* sets PF_EXITING */ + /* + * tsk->flags are checked in the futex code to protect against + * an exiting task cleaning up the robust pi futexes. 
+ */ + smp_mb(); + spin_unlock_wait(&tsk->pi_lock); + + if (unlikely(in_atomic())) + printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + + acct_update_integrals(tsk); + + group_dead = atomic_dec_and_test(&tsk->signal->live); + if (group_dead) { + hrtimer_cancel(&tsk->signal->real_timer); + exit_itimers(tsk->signal); + } + acct_collect(code, group_dead); + if (group_dead) + tty_audit_exit(); + if (unlikely(tsk->audit_context)) + audit_free(tsk); + + tsk->exit_code = code; + taskstats_exit(tsk, group_dead); + + exit_mm(tsk); + + if (group_dead) + acct_process(); + trace_sched_process_exit(tsk); + + exit_sem(tsk); + exit_files(tsk); + exit_fs(tsk); + check_stack_usage(); + exit_thread(); + cgroup_exit(tsk, 1); + + if (group_dead && tsk->signal->leader) + disassociate_ctty(1); + + module_put(task_thread_info(tsk)->exec_domain->module); + if (tsk->binfmt) + module_put(tsk->binfmt->module); + + proc_exit_connector(tsk); + exit_notify(tsk, group_dead); +#ifdef CONFIG_NUMA + mpol_put(tsk->mempolicy); + tsk->mempolicy = NULL; +#endif +#ifdef CONFIG_FUTEX + /* + * This must happen late, after the PID is not + * hashed anymore: + */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) + kfree(current->pi_state_cache); +#endif + /* + * Make sure we are holding no locks: + */ + debug_check_no_locks_held(tsk); + /* + * We can do this unlocked here. The futex code uses this flag + * just to verify whether the pi state cleanup has been done + * or not. In the worst case it loops once more. + */ + tsk->flags |= PF_EXITPIDONE; + + if (tsk->io_context) + exit_io_context(); + + if (tsk->splice_pipe) + __free_pipe_info(tsk->splice_pipe); + + preempt_disable(); + /* causes final put_task_struct in finish_task_switch(). */ + tsk->state = TASK_DEAD; + schedule(); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) + cpu_relax(); /* For when BUG is null */ +} + +EXPORT_SYMBOL_GPL(do_exit); + +#endif /* !DDE_LINUX */ + +NORET_TYPE void complete_and_exit(struct completion *comp, long code) +{ + if (comp) + complete(comp); + + do_exit(code); +} + +EXPORT_SYMBOL(complete_and_exit); + +#ifndef DDE_LINUX +SYSCALL_DEFINE1(exit, int, error_code) +{ + do_exit((error_code&0xff)<<8); +} + +/* + * Take down every thread in the group. This is called by fatal signals + * as well as by sys_exit_group (below). + */ +NORET_TYPE void +do_group_exit(int exit_code) +{ + struct signal_struct *sig = current->signal; + + BUG_ON(exit_code & 0x80); /* core dumps don't get here */ + + if (signal_group_exit(sig)) + exit_code = sig->group_exit_code; + else if (!thread_group_empty(current)) { + struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); + if (signal_group_exit(sig)) + /* Another thread got here before we took the lock. */ + exit_code = sig->group_exit_code; + else { + sig->group_exit_code = exit_code; + sig->flags = SIGNAL_GROUP_EXIT; + zap_other_threads(current); + } + spin_unlock_irq(&sighand->siglock); + } + + do_exit(exit_code); + /* NOTREACHED */ +} + +/* + * this kills every thread in the thread group. Note that any externally + * wait4()-ing process will get the correct exit code - even if this + * thread is not the thread group leader. 
+ */ +SYSCALL_DEFINE1(exit_group, int, error_code) +{ + do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; +} + +static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + struct pid *pid = NULL; + if (type == PIDTYPE_PID) + pid = task->pids[type].pid; + else if (type < PIDTYPE_MAX) + pid = task->group_leader->pids[type].pid; + return pid; +} + +static int eligible_child(enum pid_type type, struct pid *pid, int options, + struct task_struct *p) +{ + int err; + + if (type < PIDTYPE_MAX) { + if (task_pid_type(p, type) != pid) + return 0; + } + + /* Wait for all children (clone and not) if __WALL is set; + * otherwise, wait for clone children *only* if __WCLONE is + * set; otherwise, wait for non-clone children *only*. (Note: + * A "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD.) */ + if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) + && !(options & __WALL)) + return 0; + + err = security_task_wait(p); + if (err) + return err; + + return 1; +} + +static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, + int why, int status, + struct siginfo __user *infop, + struct rusage __user *rusagep) +{ + int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; + + put_task_struct(p); + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + if (!retval) + retval = pid; + return retval; +} + +/* + * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_zombie(struct task_struct *p, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + unsigned long state; + int retval, status, traced; + pid_t pid = task_pid_vnr(p); + uid_t uid = __task_cred(p)->uid; + + if (!likely(options & WEXITED)) + return 0; + + if (unlikely(options & WNOWAIT)) { + int exit_code = p->exit_code; + int why, status; + + get_task_struct(p); + read_unlock(&tasklist_lock); + if ((exit_code & 0x7f) == 0) { + why = CLD_EXITED; + status = exit_code >> 8; + } else { + why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; + status = exit_code & 0x7f; + } + return wait_noreap_copyout(p, pid, uid, why, + status, infop, ru); + } + + /* + * Try to move the task's state to DEAD + * only one thread is allowed to do this: + */ + state = xchg(&p->exit_state, EXIT_DEAD); + if (state != EXIT_ZOMBIE) { + BUG_ON(state != EXIT_DEAD); + return 0; + } + + traced = ptrace_reparented(p); + + if (likely(!traced)) { + struct signal_struct *psig; + struct signal_struct *sig; + struct task_cputime cputime; + + /* + * The resource counters for the group leader are in its + * own task_struct. Those for dead threads in the group + * are in its signal_struct, as are those for the child + * processes it has previously reaped. All these + * accumulate in the parent's signal_struct c* fields. 
+ * + * We don't bother to take a lock here to protect these + * p->signal fields, because they are only touched by + * __exit_signal, which runs with tasklist_lock + * write-locked anyway, and so is excluded here. We do + * need to protect the access to p->parent->signal fields, + * as other threads in the parent group can be right + * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. + */ + thread_group_cputime(p, &cputime); + spin_lock_irq(&p->parent->sighand->siglock); + psig = p->parent->signal; + sig = p->signal; + psig->cutime = + cputime_add(psig->cutime, + cputime_add(cputime.utime, + sig->cutime)); + psig->cstime = + cputime_add(psig->cstime, + cputime_add(cputime.stime, + sig->cstime)); + psig->cgtime = + cputime_add(psig->cgtime, + cputime_add(p->gtime, + cputime_add(sig->gtime, + sig->cgtime))); + psig->cmin_flt += + p->min_flt + sig->min_flt + sig->cmin_flt; + psig->cmaj_flt += + p->maj_flt + sig->maj_flt + sig->cmaj_flt; + psig->cnvcsw += + p->nvcsw + sig->nvcsw + sig->cnvcsw; + psig->cnivcsw += + p->nivcsw + sig->nivcsw + sig->cnivcsw; + psig->cinblock += + task_io_get_inblock(p) + + sig->inblock + sig->cinblock; + psig->coublock += + task_io_get_oublock(p) + + sig->oublock + sig->coublock; + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); + spin_unlock_irq(&p->parent->sighand->siglock); + } + + /* + * Now we are sure this task is interesting, and no other + * thread can reap it because we set its state to EXIT_DEAD. + */ + read_unlock(&tasklist_lock); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; + if (!retval && stat_addr) + retval = put_user(status, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) { + int why; + + if ((status & 0x7f) == 0) { + why = CLD_EXITED; + status >>= 8; + } else { + why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + status &= 0x7f; + } + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(status, &infop->si_status); + } + if (!retval && infop) + retval = put_user(pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = pid; + + if (traced) { + write_lock_irq(&tasklist_lock); + /* We dropped tasklist, ptracer could die and untrace */ + ptrace_unlink(p); + /* + * If this is not a detached task, notify the parent. + * If it's still not detached after that, don't release + * it now. + */ + if (!task_detached(p)) { + do_notify_parent(p, p->exit_signal); + if (!task_detached(p)) { + p->exit_state = EXIT_ZOMBIE; + p = NULL; + } + } + write_unlock_irq(&tasklist_lock); + } + if (p != NULL) + release_task(p); + + return retval; +} + +/* + * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. 
+ */ +static int wait_task_stopped(int ptrace, struct task_struct *p, + int options, struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval, exit_code, why; + uid_t uid = 0; /* unneeded, required by compiler */ + pid_t pid; + + if (!(options & WUNTRACED)) + return 0; + + exit_code = 0; + spin_lock_irq(&p->sighand->siglock); + + if (unlikely(!task_is_stopped_or_traced(p))) + goto unlock_sig; + + if (!ptrace && p->signal->group_stop_count > 0) + /* + * A group stop is in progress and this is the group leader. + * We won't report until all threads have stopped. + */ + goto unlock_sig; + + exit_code = p->exit_code; + if (!exit_code) + goto unlock_sig; + + if (!unlikely(options & WNOWAIT)) + p->exit_code = 0; + + /* don't need the RCU readlock here as we're holding a spinlock */ + uid = __task_cred(p)->uid; +unlock_sig: + spin_unlock_irq(&p->sighand->siglock); + if (!exit_code) + return 0; + + /* + * Now we are pretty sure this task is interesting. + * Make sure it doesn't get reaped out from under us while we + * give up the lock and then examine it below. We don't want to + * keep holding onto the tasklist_lock while we call getrusage and + * possibly take page faults for user memory. + */ + get_task_struct(p); + pid = task_pid_vnr(p); + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; + read_unlock(&tasklist_lock); + + if (unlikely(options & WNOWAIT)) + return wait_noreap_copyout(p, pid, uid, + why, exit_code, + infop, ru); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + if (!retval && stat_addr) + retval = put_user((exit_code << 8) | 0x7f, stat_addr); + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) + retval = put_user((short)why, &infop->si_code); + if (!retval && infop) + retval = put_user(exit_code, &infop->si_status); + if (!retval && infop) + retval = put_user(pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = pid; + put_task_struct(p); + + BUG_ON(!retval); + return retval; +} + +/* + * Handle do_wait work for one task in a live, non-stopped state. + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_continued(struct task_struct *p, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval; + pid_t pid; + uid_t uid; + + if (!unlikely(options & WCONTINUED)) + return 0; + + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) + return 0; + + spin_lock_irq(&p->sighand->siglock); + /* Re-check with the lock held. */ + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { + spin_unlock_irq(&p->sighand->siglock); + return 0; + } + if (!unlikely(options & WNOWAIT)) + p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = __task_cred(p)->uid; + spin_unlock_irq(&p->sighand->siglock); + + pid = task_pid_vnr(p); + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!infop) { + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + put_task_struct(p); + if (!retval && stat_addr) + retval = put_user(0xffff, stat_addr); + if (!retval) + retval = pid; + } else { + retval = wait_noreap_copyout(p, pid, uid, + CLD_CONTINUED, SIGCONT, + infop, ru); + BUG_ON(retval == 0); + } + + return retval; +} + +/* + * Consider @p for a wait by @parent. 
+ * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; + * then *@notask_error is 0 if @p is an eligible child, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int wait_consider_task(struct task_struct *parent, int ptrace, + struct task_struct *p, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int ret = eligible_child(type, pid, options, p); + if (!ret) + return ret; + + if (unlikely(ret < 0)) { + /* + * If we have not yet seen any eligible child, + * then let this error code replace -ECHILD. + * A permission error will give the user a clue + * to look for security policy problems, rather + * than for mysterious wait bugs. + */ + if (*notask_error) + *notask_error = ret; + } + + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * This child is hidden by ptrace. + * We aren't allowed to see it now, but eventually we will. + */ + *notask_error = 0; + return 0; + } + + if (p->exit_state == EXIT_DEAD) + return 0; + + /* + * We don't reap group leaders with subthreads. + */ + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) + return wait_task_zombie(p, options, infop, stat_addr, ru); + + /* + * It's stopped or running now, so it might + * later continue, exit, or stop again. + */ + *notask_error = 0; + + if (task_is_stopped_or_traced(p)) + return wait_task_stopped(ptrace, p, options, + infop, stat_addr, ru); + + return wait_task_continued(p, options, infop, stat_addr, ru); +} + +/* + * Do the work of do_wait() for one thread in the group, @tsk. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; then + * *@notask_error is 0 if there were any eligible children, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int do_wait_thread(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + list_for_each_entry(p, &tsk->children, sibling) { + /* + * Do not consider detached threads. + */ + if (!task_detached(p)) { + int ret = wait_consider_task(tsk, 0, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + } + + return 0; +} + +static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + /* + * Traditionally we see ptrace'd stopped tasks regardless of options. 
+ */ + options |= WUNTRACED; + + list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { + int ret = wait_consider_task(tsk, 1, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + + return 0; +} + +static long do_wait(enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + DECLARE_WAITQUEUE(wait, current); + struct task_struct *tsk; + int retval; + + trace_sched_process_wait(pid); + + add_wait_queue(¤t->signal->wait_chldexit,&wait); +repeat: + /* + * If there is nothing that can match our critiera just get out. + * We will clear @retval to zero if we see any child that might later + * match our criteria, even if we are not able to reap it yet. + */ + retval = -ECHILD; + if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) + goto end; + + current->state = TASK_INTERRUPTIBLE; + read_lock(&tasklist_lock); + tsk = current; + do { + int tsk_result = do_wait_thread(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (!tsk_result) + tsk_result = ptrace_do_wait(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (tsk_result) { + /* + * tasklist_lock is unlocked and we have a final result. + */ + retval = tsk_result; + goto end; + } + + if (options & __WNOTHREAD) + break; + tsk = next_thread(tsk); + BUG_ON(tsk->signal != current->signal); + } while (tsk != current); + read_unlock(&tasklist_lock); + + if (!retval && !(options & WNOHANG)) { + retval = -ERESTARTSYS; + if (!signal_pending(current)) { + schedule(); + goto repeat; + } + } + +end: + current->state = TASK_RUNNING; + remove_wait_queue(¤t->signal->wait_chldexit,&wait); + if (infop) { + if (retval > 0) + retval = 0; + else { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. 
+ */ + if (!retval) + retval = put_user(0, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user(0, &infop->si_code); + if (!retval) + retval = put_user(0, &infop->si_pid); + if (!retval) + retval = put_user(0, &infop->si_uid); + if (!retval) + retval = put_user(0, &infop->si_status); + } + } + return retval; +} + +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) +{ + struct pid *pid = NULL; + enum pid_type type; + long ret; + + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + return -EINVAL; + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) + return -EINVAL; + + switch (which) { + case P_ALL: + type = PIDTYPE_MAX; + break; + case P_PID: + type = PIDTYPE_PID; + if (upid <= 0) + return -EINVAL; + break; + case P_PGID: + type = PIDTYPE_PGID; + if (upid <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (type < PIDTYPE_MAX) + pid = find_get_pid(upid); + ret = do_wait(type, pid, options, infop, NULL, ru); + put_pid(pid); + + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(5, ret, which, upid, infop, options, ru); + return ret; +} + +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) +{ + struct pid *pid = NULL; + enum pid_type type; + long ret; + + if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) + return -EINVAL; + + if (upid == -1) + type = PIDTYPE_MAX; + else if (upid < 0) { + type = PIDTYPE_PGID; + pid = find_get_pid(-upid); + } else if (upid == 0) { + type = PIDTYPE_PGID; + pid = get_pid(task_pgrp(current)); + } else /* upid > 0 */ { + type = PIDTYPE_PID; + pid = find_get_pid(upid); + } + + ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); + put_pid(pid); + + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); + return ret; +} + +#ifdef __ARCH_WANT_SYS_WAITPID + +/* + * sys_waitpid() remains for compatibility. waitpid() should be + * implemented by calling sys_wait4() from libc.a. + */ +SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) +{ + return sys_wait4(pid, stat_addr, options, NULL); +} + +#endif +#endif diff --git a/libdde-linux26/lib/src/kernel/irq/handle.c b/libdde-linux26/lib/src/kernel/irq/handle.c new file mode 100644 index 00000000..ac7b14f8 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/irq/handle.c @@ -0,0 +1,23 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code. + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/rculist.h> +#include <linux/hash.h> + +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + diff --git a/libdde-linux26/lib/src/kernel/resource.c b/libdde-linux26/lib/src/kernel/resource.c new file mode 100644 index 00000000..3dd07a35 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/resource.c @@ -0,0 +1,936 @@ +/* + * linux/kernel/resource.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 1999 Martin Mares <mj@ucw.cz> + * + * Arbitrary resource management. 
+ */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/device.h> +#include <linux/pfn.h> +#include <asm/io.h> + + +struct resource ioport_resource = { + .name = "PCI IO", + .start = 0, + .end = IO_SPACE_LIMIT, + .flags = IORESOURCE_IO, +}; +EXPORT_SYMBOL(ioport_resource); + +struct resource iomem_resource = { + .name = "PCI mem", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, +}; +EXPORT_SYMBOL(iomem_resource); + +static DEFINE_RWLOCK(resource_lock); + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + if (p->child) + return p->child; + while (!p->sibling && p->parent) + p = p->parent; + return p->sibling; +} + +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + +static void *r_start(struct seq_file *m, loff_t *pos) + __acquires(resource_lock) +{ + struct resource *p = m->private; + loff_t l = 0; + read_lock(&resource_lock); + for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) + ; + return p; +} + +static void r_stop(struct seq_file *m, void *v) + __releases(resource_lock) +{ + read_unlock(&resource_lock); +} + +static int r_show(struct seq_file *m, void *v) +{ + struct resource *root = m->private; + struct resource *r = v, *p; + int width = root->end < 0x10000 ? 4 : 8; + int depth; + + for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) + if (p->parent == root) + break; + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", + depth * 2, "", + width, (unsigned long long) r->start, + width, (unsigned long long) r->end, + r->name ? r->name : "<BAD>"); + return 0; +} + +static const struct seq_operations resource_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = r_show, +}; + +static int ioports_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &ioport_resource; + } + return res; +} + +static int iomem_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &iomem_resource; + } + return res; +} + +static const struct file_operations proc_ioports_operations = { + .open = ioports_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations proc_iomem_operations = { + .open = iomem_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init ioresources_init(void) +{ + proc_create("ioports", 0, NULL, &proc_ioports_operations); + proc_create("iomem", 0, NULL, &proc_iomem_operations); + return 0; +} +__initcall(ioresources_init); + +#endif /* CONFIG_PROC_FS */ + +/* Return the conflict entry if you can't request it */ +static struct resource * __request_resource(struct resource *root, struct resource *new) +{ + resource_size_t start = new->start; + resource_size_t end = new->end; + struct resource *tmp, **p; + + if (end < start) + return root; + if (start < root->start) + return root; + if (end > root->end) + return root; + p = &root->child; + for (;;) { + tmp = *p; + if (!tmp || tmp->start > end) { + new->sibling = tmp; + *p = new; + new->parent = root; + return NULL; + } + p = &tmp->sibling; + if (tmp->end < start) + continue; + return tmp; + } +} + +static int 
__release_resource(struct resource *old) +{ + struct resource *tmp, **p; + + p = &old->parent->child; + for (;;) { + tmp = *p; + if (!tmp) + break; + if (tmp == old) { + *p = tmp->sibling; + old->parent = NULL; + return 0; + } + p = &tmp->sibling; + } + return -EINVAL; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __request_resource(root, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +EXPORT_SYMBOL(request_resource); + +/** + * release_resource - release a previously reserved resource + * @old: resource pointer + */ +int release_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old); + write_unlock(&resource_lock); + return retval; +} + +EXPORT_SYMBOL(release_resource); + +#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +/* + * Finds the lowest memory reosurce exists within [res->start.res->end) + * the caller must specify res->start, res->end, res->flags. + * If found, returns 0, res is overwritten, if not found, returns -1. + */ +static int find_next_system_ram(struct resource *res) +{ + resource_size_t start, end; + struct resource *p; + + BUG_ON(!res); + + start = res->start; + end = res->end; + BUG_ON(start >= end); + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + /* system ram is just marked as IORESOURCE_MEM */ + if (p->flags != res->flags) + continue; + if (p->start > end) { + p = NULL; + break; + } + if ((p->end >= start) && (p->start < end)) + break; + } + read_unlock(&resource_lock); + if (!p) + return -1; + /* copy data */ + if (res->start < p->start) + res->start = p->start; + if (res->end > p->end) + res->end = p->end; + return 0; +} +int +walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, + int (*func)(unsigned long, unsigned long, void *)) +{ + struct resource res; + unsigned long pfn, len; + u64 orig_end; + int ret = -1; + res.start = (u64) start_pfn << PAGE_SHIFT; + res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { + pfn = (unsigned long)(res.start >> PAGE_SHIFT); + len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); + ret = (*func)(pfn, len, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#endif + +/* + * Find empty slot in the resource tree given range and alignment. + */ +static int find_resource(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + void (*alignf)(void *, struct resource *, + resource_size_t, resource_size_t), + void *alignf_data) +{ + struct resource *this = root->child; + + new->start = root->start; + /* + * Skip past an allocated resource that starts at 0, since the assignment + * of this->start - 1 to new->end below would cause an underflow. 
+ */ + if (this && this->start == 0) { + new->start = this->end + 1; + this = this->sibling; + } + for(;;) { + if (this) + new->end = this->start - 1; + else + new->end = root->end; + if (new->start < min) + new->start = min; + if (new->end > max) + new->end = max; + new->start = ALIGN(new->start, align); + if (alignf) + alignf(alignf_data, new, size, align); + if (new->start < new->end && new->end - new->start >= size - 1) { + new->end = new->start + size - 1; + return 0; + } + if (!this) + break; + new->start = this->end + 1; + this = this->sibling; + } + return -EBUSY; +} + +/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * @size: requested resource region size + * @min: minimum size to allocate + * @max: maximum size to allocate + * @align: alignment requested, in bytes + * @alignf: alignment function, optional, called if not NULL + * @alignf_data: arbitrary data to pass to the @alignf function + */ +int allocate_resource(struct resource *root, struct resource *new, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, + void (*alignf)(void *, struct resource *, + resource_size_t, resource_size_t), + void *alignf_data) +{ + int err; + + write_lock(&resource_lock); + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + if (err >= 0 && __request_resource(root, new)) + err = -EBUSY; + write_unlock(&resource_lock); + return err; +} + +EXPORT_SYMBOL(allocate_resource); + +/* + * Insert a resource into the resource tree. If successful, return NULL, + * otherwise return the conflicting resource (compare to __request_resource()) + */ +static struct resource * __insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *first, *next; + + for (;; parent = first) { + first = __request_resource(parent, new); + if (!first) + return first; + + if (first == parent) + return first; + + if ((first->start > new->start) || (first->end < new->end)) + break; + if ((first->start == new->start) && (first->end == new->end)) + break; + } + + for (next = first; ; next = next->sibling) { + /* Partial overlap? Bad, and unfixable */ + if (next->start < new->start || next->end > new->end) + return next; + if (!next->sibling) + break; + if (next->sibling->start > new->end) + break; + } + + new->parent = parent; + new->sibling = next->sibling; + new->child = first; + + next->sibling = NULL; + for (next = first; next; next = next->sibling) + next->parent = new; + + if (parent->child == first) { + parent->child = new; + } else { + next = parent->child; + while (next->sibling != first) + next = next->sibling; + next->sibling = new; + } + return NULL; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent to request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become children of + * the new resource. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __insert_resource(parent, new); + write_unlock(&resource_lock); + return conflict ? 
-EBUSY : 0; +} + +/** + * insert_resource_expand_to_fit - Insert a resource into the resource tree + * @root: root resource descriptor + * @new: new resource to insert + * + * Insert a resource into the resource tree, possibly expanding it in order + * to make it encompass any conflicting resources. + */ +void insert_resource_expand_to_fit(struct resource *root, struct resource *new) +{ + if (new->parent) + return; + + write_lock(&resource_lock); + for (;;) { + struct resource *conflict; + + conflict = __insert_resource(root, new); + if (!conflict) + break; + if (conflict == root) + break; + + /* Ok, expand resource to cover the conflict, then try again .. */ + if (conflict->start < new->start) + new->start = conflict->start; + if (conflict->end > new->end) + new->end = conflict->end; + + printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + } + write_unlock(&resource_lock); +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +{ + struct resource *tmp, *parent = res->parent; + resource_size_t end = start + size - 1; + int result = -EBUSY; + + write_lock(&resource_lock); + + if ((start < parent->start) || (end > parent->end)) + goto out; + + for (tmp = res->child; tmp; tmp = tmp->sibling) { + if ((tmp->start < start) || (tmp->end > end)) + goto out; + } + + if (res->sibling && (res->sibling->start <= end)) + goto out; + + tmp = parent->child; + if (tmp != res) { + while (tmp->sibling != res) + tmp = tmp->sibling; + if (start <= tmp->end) + goto out; + } + + res->start = start; + res->end = end; + result = 0; + + out: + write_unlock(&resource_lock); + return result; +} + +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. 
*/ + kfree(res); + res = NULL; + break; + } + + if (!res) { + /* failed, split and try again */ + + /* conflict covered whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void __init reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + +EXPORT_SYMBOL(adjust_resource); + +/** + * resource_alignment - calculate resource's alignment + * @res: resource pointer + * + * Returns alignment on success, 0 (invalid alignment) on failure. + */ +resource_size_t resource_alignment(struct resource *res) +{ + switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { + case IORESOURCE_SIZEALIGN: + return resource_size(res); + case IORESOURCE_STARTALIGN: + return res->start; + default: + return 0; + } +} + +/* + * This is compatibility stuff for IO resources. + * + * Note how this, unlike the above, knows about + * the IO flag meanings (busy etc). + * + * request_region creates a new busy region. + * + * check_region returns non-zero if the area is already busy. + * + * release_region releases a matching busy region. + */ + +#ifndef DDE_LINUX +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string + * @flags: IO resource flags + */ +struct resource * __request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name, int flags) +{ + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return NULL; + + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; + res->flags |= flags; + + write_lock(&resource_lock); + + for (;;) { + struct resource *conflict; + + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + write_unlock(&resource_lock); + return res; +} +EXPORT_SYMBOL(__request_region); + +/** + * __check_region - check if a resource region is busy or free + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * Returns 0 if the region is free at the moment it is checked, + * returns %-EBUSY if the region is busy. + * + * NOTE: + * This function is deprecated because its use is racy. + * Even if it returns 0, a subsequent call to request_region() + * may fail because another driver etc. just allocated the region. + * Do NOT use it. It will be removed from the kernel. 
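 * (Editor's illustration, not part of the patch, assuming a hypothetical
 * driver probing a fixed port range -- the check/request pair is inherently
 * racy:
 *
 *	if (!__check_region(&ioport_resource, 0x300, 8)) {
 *		-- another driver may reserve 0x300-0x307 right here --
 *		r = __request_region(&ioport_resource, 0x300, 8, "mydrv", 0);
 *		-- so r can still be NULL and must be checked --
 *	}
 *
 * Calling __request_region() alone and testing its return value performs
 * the check and the reservation atomically under resource_lock.)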
+ */ +int __check_region(struct resource *parent, resource_size_t start, + resource_size_t n) +{ + struct resource * res; + + res = __request_region(parent, start, n, "check-region", 0); + if (!res) + return -EBUSY; + + release_resource(res); + kfree(res); + return 0; +} +EXPORT_SYMBOL(__check_region); + +/** + * __release_region - release a previously reserved resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * The described resource region must match a currently busy region. + */ +void __release_region(struct resource *parent, resource_size_t start, + resource_size_t n) +{ + struct resource **p; + resource_size_t end; + + p = &parent->child; + end = start + n - 1; + + write_lock(&resource_lock); + + for (;;) { + struct resource *res = *p; + + if (!res) + break; + if (res->start <= start && res->end >= end) { + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + if (res->start != start || res->end != end) + break; + *p = res->sibling; + write_unlock(&resource_lock); + kfree(res); + return; + } + p = &res->sibling; + } + + write_unlock(&resource_lock); + + printk(KERN_WARNING "Trying to free nonexistent resource " + "<%016llx-%016llx>\n", (unsigned long long)start, + (unsigned long long)end); +} +EXPORT_SYMBOL(__release_region); +#endif /* DDE_LINUX */ + +/* + * Managed region resource + */ +struct region_devres { + struct resource *parent; + resource_size_t start; + resource_size_t n; +}; + +static void devm_region_release(struct device *dev, void *res) +{ + struct region_devres *this = res; + + __release_region(this->parent, this->start, this->n); +} + +static int devm_region_match(struct device *dev, void *res, void *match_data) +{ + struct region_devres *this = res, *match = match_data; + + return this->parent == match->parent && + this->start == match->start && this->n == match->n; +} + +struct resource * __devm_request_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n, const char *name) +{ + struct region_devres *dr = NULL; + struct resource *res; + + dr = devres_alloc(devm_region_release, sizeof(struct region_devres), + GFP_KERNEL); + if (!dr) + return NULL; + + dr->parent = parent; + dr->start = start; + dr->n = n; + + res = __request_region(parent, start, n, name, 0); + if (res) + devres_add(dev, dr); + else + devres_free(dr); + + return res; +} +EXPORT_SYMBOL(__devm_request_region); + +void __devm_release_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n) +{ + struct region_devres match_data = { parent, start, n }; + + __release_region(parent, start, n); + WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + &match_data)); +} +EXPORT_SYMBOL(__devm_release_region); + +/* + * Called from init/main.c to reserve IO ports. + */ +#define MAXRESERVE 4 +static int __init reserve_setup(char *str) +{ + static int reserved; + static struct resource reserve[MAXRESERVE]; + + for (;;) { + int io_start, io_num; + int x = reserved; + + if (get_option (&str, &io_start) != 2) + break; + if (get_option (&str, &io_num) == 0) + break; + if (x < MAXRESERVE) { + struct resource *res = reserve + x; + res->name = "reserved"; + res->start = io_start; + res->end = io_start + io_num - 1; + res->flags = IORESOURCE_BUSY; + res->child = NULL; + if (request_resource(res->start >= 0x10000 ? 
&iomem_resource : &ioport_resource, res) == 0) + reserved = x+1; + } + } + return 1; +} + +__setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && + PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) + continue; + /* + * if a resource is "BUSY", it's not a hardware resource + * but a driver mapping of such a resource; we don't want + * to warn for those; some drivers legitimately map only + * partial hardware resources. (example: vesafb) + */ + if (p->flags & IORESOURCE_BUSY) + continue; + + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} + +#ifdef CONFIG_STRICT_DEVMEM +static int strict_iomem_checks = 1; +#else +static int strict_iomem_checks; +#endif + +/* + * check if an address is reserved in the iomem resource tree + * returns 1 if reserved, 0 if not reserved. + */ +int iomem_is_exclusive(u64 addr) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + int size = PAGE_SIZE; + + if (!strict_iomem_checks) + return 0; + + addr = addr & PAGE_MASK; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + break; + if (p->end < addr) + continue; + if (p->flags & IORESOURCE_BUSY && + p->flags & IORESOURCE_EXCLUSIVE) { + err = 1; + break; + } + } + read_unlock(&resource_lock); + + return err; +} + +static int __init strict_iomem(char *str) +{ + if (strstr(str, "relaxed")) + strict_iomem_checks = 0; + if (strstr(str, "strict")) + strict_iomem_checks = 1; + return 1; +} + +__setup("iomem=", strict_iomem); diff --git a/libdde-linux26/lib/src/kernel/sched.c b/libdde-linux26/lib/src/kernel/sched.c new file mode 100644 index 00000000..5c51695e --- /dev/null +++ b/libdde-linux26/lib/src/kernel/sched.c @@ -0,0 +1,9654 @@ +/* + * kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. 
+ * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/debug_locks.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/profile.h> +#include <linux/freezer.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/pid_namespace.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/percpu.h> +#include <linux/kthread.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sysctl.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/tsacct_kern.h> +#include <linux/kprobes.h> +#include <linux/delayacct.h> +#include <linux/reciprocal_div.h> +#include <linux/unistd.h> +#include <linux/pagemap.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> +#include <linux/ftrace.h> +#include <trace/sched.h> + +#include <asm/tlb.h> +#include <asm/irq_regs.h> + +#include "sched_cpupri.h" + +#ifdef DDE_LINUX +/* DDE_LINUX implements this function externally */ +extern int try_to_wake_up(struct task_struct *p, unsigned int state, int sync); +#endif + +/** DDE only uses small parts of this. */ +#ifndef DDE_LINUX +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define DEF_TIMESLICE (100 * HZ / 1000) + +/* + * single value that denotes runtime == period, ie unlimited time. 
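 * (Editor's note, not part of the patch: RUNTIME_INF, defined just below,
 * is simply the all-ones u64.  Writing -1 to sysctl_sched_rt_runtime makes
 * global_rt_runtime() further down return this value, and
 * start_rt_bandwidth() then never arms the period timer, i.e. realtime
 * throttling is effectively disabled.)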
+ */ +#define RUNTIME_INF ((u64)~0ULL) + +DEFINE_TRACE(sched_wait_task); +DEFINE_TRACE(sched_wakeup); +DEFINE_TRACE(sched_wakeup_new); +DEFINE_TRACE(sched_switch); +DEFINE_TRACE(sched_migrate_task); + +#ifdef CONFIG_SMP + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif + +static inline int rt_policy(int policy) +{ + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; +} + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. 
+ */ +static DEFINE_MUTEX(sched_domains_mutex); + +#ifdef CONFIG_GROUP_SCHED + +#include <linux/cgroup.h> + +struct cfs_rq; + +static LIST_HEAD(task_groups); + +/* task group related information */ +struct task_group { +#ifdef CONFIG_CGROUP_SCHED + struct cgroup_subsys_state css; +#endif + +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; +}; + +#ifdef CONFIG_USER_SCHED + +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. + */ +struct task_group root_task_group; + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +#endif /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ +#define root_task_group init_task_group +#endif /* CONFIG_USER_SCHED */ + +/* task_group_lock serializes add/remove of task groups and also changes to + * a task group's cpu shares. + */ +static DEFINE_SPINLOCK(task_group_lock); + +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_USER_SCHED +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) +#else /* !CONFIG_USER_SCHED */ +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +#endif /* CONFIG_USER_SCHED */ + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES 2 +#define MAX_SHARES (1UL << 18) + +static int init_task_group_load = INIT_TASK_GROUP_LOAD; +#endif + +/* Default task group. + * Every task in system belong to this group at bootup. 
+ */ +struct task_group init_task_group; + +/* return group to which a task belongs */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + +#ifdef CONFIG_USER_SCHED + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); +#elif defined(CONFIG_CGROUP_SCHED) + tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), + struct task_group, css); +#else + tg = &init_task_group; +#endif + return tg; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_GROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running; + + u64 exec_clock; + u64 min_vruntime; + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last; + + unsigned int nr_spread_over; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + + /* + * this cpu's part of tg->shares + */ + unsigned long shares; + + /* + * load.weight at the time we set shares + */ + unsigned long rq_weight; +#endif +#endif +}; + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + int highest_prio; /* highest queued rt task prio */ +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + int overloaded; +#endif + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; + struct sched_rt_entity *rt_se; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. 
+ * + */ +struct root_domain { + atomic_t refcount; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif +}; + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +static struct root_domain def_root_domain; + +#endif + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; + unsigned char in_nohz_recently; +#endif + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED + struct list_head leaf_rt_rq_list; +#endif + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; + + u64 clock; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + /* cpu of this runqueue: */ + int cpu; + int online; + + unsigned long avg_load_per_task; + + struct task_struct *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_exp_empty; + unsigned int yld_act_empty; + unsigned int yld_both_empty; + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; + + /* BKL stats */ + unsigned int bkl_count; +#endif +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +{ + rq->curr->sched_class->check_preempt_curr(rq, p, sync); +} + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +static inline void update_rq_clock(struct rq *rq) +{ + rq->clock = sched_clock_cpu(cpu_of(rq)); +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. 
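 * (Editor's sketch, not part of the patch: a caller along the lines of the
 * printk path described above would use it as
 *
 *	if (!runqueue_is_locked())
 *		wake_up_klogd();
 *
 * where wake_up_klogd() is named purely to illustrate that use.)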
+ */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + +/* + * Debugging: various feature bits + */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "sched_features.h" +}; + +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + +const_debug unsigned int sysctl_sched_features = +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static __read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; sched_feat_names[i]; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * ratelimit for updating the group shares. + * default: 0.25ms + */ +unsigned int sysctl_sched_shares_ratelimit = 250000; + +/* + * Inject some fuzzyness into changing the per-cpu group shares + * this avoids remote rq-locks at the expense of fairness. + * default: 4 + */ +unsigned int sysctl_sched_shares_thresh = 4; + +/* + * period over which we measure -rt task cpu usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +static __read_mostly int scheduler_running; + +/* + * part of the period that we allow rt tasks to run in us. 
+ * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(struct rq *rq, struct task_struct *p) +{ + return task_current(rq, p); +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return task_current(rq, p); +#endif +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +/* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + for (;;) { + struct rq *rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + spin_unlock(&rq->lock); + } +} + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. 
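 * (Editor's sketch, not part of the patch: the typical caller pattern,
 * assuming p is a task whose scheduling state must be examined atomically,
 * is
 *
 *	unsigned long flags;
 *	struct rq *rq = task_rq_lock(p, &flags);
 *	-- p cannot change runqueues while rq->lock is held --
 *	task_rq_unlock(rq, &flags);
 *
 * The retry loop below covers the window in which p may migrate between
 * the task_rq(p) lookup and the acquisition of that runqueue's lock.)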
+ */ +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + spin_unlock_irqrestore(&rq->lock, *flags); + } +} + +void task_rq_unlock_wait(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */ + spin_unlock_wait(&rq->lock); +} + +static void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. + */ +static struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock(&rq->lock); + update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) +{ + struct rq *rq = arg; + + spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + spin_unlock(&rq->lock); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + + hrtimer_set_expires(timer, time); + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + rq->hrtick_csd_pending = 1; + } +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hrtick_clear(cpu_rq(cpu)); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static __init void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} +#else +/* + * Called to set the hrtick timer state. 
+ * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; + + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif + + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; +} +#else /* CONFIG_SCHED_HRTICK */ +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SCHED_HRTICK */ + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) +{ + int cpu; + + assert_spin_locked(&task_rq(p)->lock); + + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} + +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. 
The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif /* CONFIG_NO_HZ */ + +#else /* !CONFIG_SMP */ +static void resched_task(struct task_struct *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif /* CONFIG_SMP */ + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) +{ + u64 tmp; + + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } + + tmp = (u64)delta_exec * weight; + /* + * Check whether we'd overflow the 64-bit multiplication: + */ + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +} + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
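 * (Editor's worked example, not part of the patch: for nice 0,
 * prio_to_weight[20] == 1024 and 2^32 / 1024 == 4194304, which is exactly
 * the nice-0 entry of the table below.  calc_delta_mine() above can thus
 * replace the division delta * weight / lw->weight with the cheaper
 *
 *	(delta * weight * lw->inv_weight) >> 32
 *
 * evaluated through the rounding SRR() helper.)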
+ * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); + +/* + * runqueue iterator, to support SMP load-balancing between different + * scheduling classes, without having to expose their internal data + * structures to the load-balancing proper: + */ +struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); +}; + +#ifdef CONFIG_SMP +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator); + +static int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator); +#endif + +#ifdef CONFIG_CGROUP_CPUACCT +static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +#endif + +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + struct task_group *parent, *child; + int ret; + + rcu_read_lock(); + parent = &root_task_group; +down: + ret = (*down)(parent, data); + if (ret) + goto out_unlock; + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + ret = (*up)(parent, data); + if (ret) + goto out_unlock; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out_unlock: + rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + + if (nr_running) + rq->avg_load_per_task = rq->load.weight / nr_running; + else + rq->avg_load_per_task = 0; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. 
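 * (Editor's worked example with made-up numbers, not part of the patch:
 * with sd_shares = 1024, this cpu's rq_weight = 2048 and a domain-wide
 * sd_rq_weight = 4096, the function below computes
 *
 *	shares = 1024 * 2048 / 4096 = 512
 *
 * i.e. half of the group's shares for the cpu carrying half of the group's
 * runnable weight, clamped afterwards to [MIN_SHARES, MAX_SHARES].)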
+ */ +static void +update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, unsigned long sd_rq_weight) +{ + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[cpu]) + return; + + rq_weight = tg->cfs_rq[cpu]->rq_weight; + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = (sd_shares * rq_weight) / sd_rq_weight; + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); + + if (abs(shares - tg->se[cpu]->load.weight) > + sysctl_sched_shares_thresh) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + tg->cfs_rq[cpu]->shares = shares; + + __set_se_shares(tg->se[cpu], shares); + spin_unlock_irqrestore(&rq->lock, flags); + } +} + +/* + * Re-compute the task group their per cpu shares over the given domain. + * This needs to be done in a bottom-up fashion because the rq weight of a + * parent group depends on the shares of its child groups. + */ +static int tg_shares_up(struct task_group *tg, void *data) +{ + unsigned long weight, rq_weight = 0; + unsigned long shares = 0; + struct sched_domain *sd = data; + int i; + + for_each_cpu(i, sched_domain_span(sd)) { + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation. + */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; + shares += tg->cfs_rq[i]->shares; + } + + if ((!shares && rq_weight) || shares > tg->shares) + shares = tg->shares; + + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; + + for_each_cpu(i, sched_domain_span(sd)) + update_group_shares_cpu(tg, i, shares, rq_weight); + + return 0; +} + +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. + */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->cfs_rq[cpu]->shares; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_shares(struct sched_domain *sd) +{ + u64 now = cpu_clock(raw_smp_processor_id()); + s64 elapsed = now - sd->last_update; + + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { + sd->last_update = now; + walk_tg_tree(tg_nop, tg_shares_up, sd); + } +} + +static void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ + spin_unlock(&rq->lock); + update_shares(sd); + spin_lock(&rq->lock); +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + +#else + +static inline void update_shares(struct sched_domain *sd) +{ +} + +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ +} + +#endif + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
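 * (Editor's note, not part of the patch: to avoid an ABBA deadlock the two
 * runqueue locks are always taken in ascending address order, as the
 * struct rq comment above requires.  When the trylock on busiest->lock
 * fails and busiest has the lower address, this_rq->lock is dropped and
 * both locks are re-acquired in order:
 *
 *	spin_unlock(&this_rq->lock);
 *	spin_lock(&busiest->lock);
 *	spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
 *
 * and the function returns 1 so the caller knows this_rq was briefly
 * unlocked and any state derived from it may be stale.)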
+ */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +#ifdef CONFIG_SMP + cfs_rq->shares = shares; +#endif +} +#endif + +#include "sched_stats.h" +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + +#define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +static void inc_nr_running(struct rq *rq) +{ + rq->nr_running++; +} + +static void dec_nr_running(struct rq *rq) +{ + rq->nr_running--; +} + +static void set_load_weight(struct task_struct *p) +{ + if (task_has_rt_policy(p)) { + p->se.load.weight = prio_to_weight[0] * 2; + p->se.load.inv_weight = prio_to_wmult[0] >> 1; + return; + } + + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + p->se.load.weight = WEIGHT_IDLEPRIO; + p->se.load.inv_weight = WMULT_IDLEPRIO; + return; + } + + p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; + p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; +} + +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +{ + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, wakeup); + p->se.on_rq = 1; +} + +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) +{ + if (sleep && p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } + + sched_info_dequeued(p); + p->sched_class->dequeue_task(rq, p, sleep); + p->se.on_rq = 0; +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) +{ + return p->static_prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + int prio; + + if (task_has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. 
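 * (Editor's worked example, not part of the patch: a SCHED_FIFO task with
 * rt_priority 10 gets normal_prio = MAX_RT_PRIO - 1 - 10 = 89, while a
 * SCHED_NORMAL task at nice 0 keeps its static_prio of 120, i.e.
 * MAX_RT_PRIO + 20.  effective_prio() below only lets a boosted p->prio
 * stand while that value is still in the RT range.)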
+ */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/* + * activate_task - move a task to the runqueue. + */ +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, wakeup); + inc_nr_running(rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, sleep); + dec_nr_running(rq); +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const struct task_struct *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio, int running) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p, running); + p->sched_class->switched_to(rq, p, running); + } else + p->sched_class->prio_changed(rq, p, oldprio, running); +} + +#ifdef CONFIG_SMP + +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Is this task likely cache-hot: + */ +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ + s64 delta; + + /* + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; + + if (p->sched_class != &fair_sched_class) + return 0; + + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + + +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ + int old_cpu = task_cpu(p); + struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct cfs_rq *old_cfsrq = task_cfs_rq(p), + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); + u64 clock_offset; + + clock_offset = old_rq->clock - new_rq->clock; + + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + +#ifdef CONFIG_SCHEDSTATS + if (p->se.wait_start) + p->se.wait_start -= clock_offset; + if (p->se.sleep_start) + p->se.sleep_start -= clock_offset; + if (p->se.block_start) + p->se.block_start -= clock_offset; + if (old_cpu != new_cpu) { + schedstat_inc(p, se.nr_migrations); + if (task_hot(p, old_rq->clock, NULL)) + schedstat_inc(p, se.nr_forced2_migrations); + } +#endif + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; + + __set_task_cpu(p, new_cpu); +} + +struct migration_req { + struct list_head list; + + struct task_struct *task; + int dest_cpu; + + struct completion 
done; +}; + +/* + * The task's runqueue lock must be held. + * Returns true if you have to wait for migration thread. + */ +static int +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +{ + struct rq *rq = task_rq(p); + + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->se.on_rq && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } + + init_completion(&req->done); + req->task = p; + req->dest_cpu = dest_cpu; + list_add(&req->list, &rq->migration_queue); + + return 1; +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ + unsigned long flags; + int running, on_rq; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + trace_sched_wait_task(rq, p); + running = task_running(rq, p); + on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, &flags); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it wa still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + schedule_timeout_uninterruptible(1); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! 
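+ *
+ * ncsw still holds the switch count sampled under the runqueue lock, with
+ * LONG_MIN ORed in, so a successful result is never confused with the zero
+ * returned on a state mismatch.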
+ */ + break; + } + + return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesnt have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(struct task_struct *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. 
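+ *
+ * Only CPUs that are both in the group and in p->cpus_allowed are considered;
+ * ties are broken in favour of this_cpu so the task is not moved needlessly.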
+ */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + load = weighted_cpuload(i); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; + + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, stop there. + */ + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + break; + if (tmp->flags & flag) + sd = tmp; + } + + if (sd) + update_shares(sd); + + while (sd) { + struct sched_group *group; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, t, cpu); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, t, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= cpumask_weight(sched_domain_span(tmp))) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. 
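+ *
+ * On SMP the wakeup may also migrate the task: select_task_rq() picks a cpu,
+ * set_task_cpu() moves the task there, and the runqueue lock is dropped and
+ * re-taken for the new cpu, after which the wakeup state checks are redone.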
+ */ +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +{ + int cpu, orig_cpu, this_cpu, success = 0; + unsigned long flags; + long old_state; + struct rq *rq; + + if (!sched_feat(SYNC_WAKEUPS)) + sync = 0; + +#ifdef CONFIG_SMP + if (sched_feat(LB_WAKEUP_UPDATE)) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); + cpu = task_cpu(p); + + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + update_shares(sd); + break; + } + } + } +#endif + + smp_wmb(); + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->se.on_rq) + goto out_running; + + cpu = task_cpu(p); + orig_cpu = cpu; + this_cpu = smp_processor_id(); + +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) + goto out_activate; + + cpu = p->sched_class->select_task_rq(p, sync); + if (cpu != orig_cpu) { + set_task_cpu(p, cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->se.on_rq) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(rq, ttwu_count); + if (cpu == this_cpu) + schedstat_inc(rq, ttwu_local); + else { + struct sched_domain *sd; + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif /* CONFIG_SCHEDSTATS */ + +out_activate: +#endif /* CONFIG_SMP */ + schedstat_inc(p, se.nr_wakeups); + if (sync) + schedstat_inc(p, se.nr_wakeups_sync); + if (orig_cpu != cpu) + schedstat_inc(p, se.nr_wakeups_migrate); + if (cpu == this_cpu) + schedstat_inc(p, se.nr_wakeups_local); + else + schedstat_inc(p, se.nr_wakeups_remote); + activate_task(rq, p, 1); + success = 1; + +out_running: + trace_sched_wakeup(rq, p, success); + check_preempt_curr(rq, p, sync); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif +out: + current->se.last_wakeup = current->se.sum_exec_runtime; + + task_rq_unlock(rq, &flags); + + return success; +} +#endif /* !DDE_LINUX */ + +int wake_up_process(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_ALL, 0); +} +EXPORT_SYMBOL(wake_up_process); + +int wake_up_state(struct task_struct *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +#ifndef DDE_LINUX +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: + */ +static void __sched_fork(struct task_struct *p) +{ + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; + +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sum_sleep_runtime = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.wait_max = 0; +#endif + + INIT_LIST_HEAD(&p->rt.run_list); + p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. 
This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p, int clone_flags) +{ + int cpu = get_cpu(); + + __sched_fork(p); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (likely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif +#ifdef CONFIG_PREEMPT + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; +#endif + put_cpu(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +{ + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); + update_rq_clock(rq); + + p->prio = effective_prio(p); + + if (!p->sched_class->task_new || !current->se.on_rq) { + activate_task(rq, p, 0); + } else { + /* + * Let the scheduling class do new task startup + * management (if any): + */ + p->sched_class->task_new(rq, p); + inc_nr_running(rq); + } + trace_sched_wakeup_new(rq, p, 1); + check_preempt_curr(rq, p, 0); +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is safe to call from within a preemption notifier. 
+ */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + fire_sched_out_preempt_notifiers(prev, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(struct rq *rq, struct task_struct *prev) + __releases(rq->lock) +{ + struct mm_struct *mm = rq->prev_mm; + long prev_state; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * The test for TASK_DEAD must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul <manfred@colorfullife.com> + */ + prev_state = prev->state; + finish_arch_switch(prev); + finish_lock_switch(rq, prev); +#ifdef CONFIG_SMP + if (current->sched_class->post_schedule) + current->sched_class->post_schedule(rq); +#endif + + fire_sched_in_preempt_notifiers(current); + if (mm) + mmdrop(mm); + if (unlikely(prev_state == TASK_DEAD)) { + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. 
+ */ + kprobe_flush_task(prev); + put_task_struct(prev); + } +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq = this_rq(); + + finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline void +context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + struct mm_struct *mm, *oldmm; + + prepare_task_switch(rq, prev, next); + trace_sched_switch(rq, prev, next); + mm = next->mm; + oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_enter_lazy_cpu_mode(); + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + rq->prev_mm = oldmm; + } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_possible_cpu(i) + sum += cpu_rq(i)->nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long long nr_context_switches(void) +{ + int i; + unsigned long long sum = 0; + + for_each_possible_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_possible_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +unsigned long nr_active(void) +{ + unsigned long i, running = 0, uninterruptible = 0; + + for_each_online_cpu(i) { + running += cpu_rq(i)->nr_running; + uninterruptible += cpu_rq(i)->nr_uninterruptible; + } + + if (unlikely((long)uninterruptible < 0)) + uninterruptible = 0; + + return running + uninterruptible; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). 
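+ *
+ * Each cpu_load[i] is an exponentially decaying average of the runqueue
+ * weight with a progressively longer horizon:
+ *
+ *   cpu_load[i] = (cpu_load[i] * (2^i - 1) + load) / 2^i
+ *
+ * so cpu_load[0] follows the instantaneous load and higher indices react
+ * more slowly.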
+ */ +static void update_cpu_load(struct rq *this_rq) +{ + unsigned long this_load = this_rq->load.weight; + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } +} + +#ifdef CONFIG_SMP + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&rq2->lock); + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } + update_rq_clock(rq1); + update_rq_clock(rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. + */ +static void sched_migrate_task(struct task_struct *p, int dest_cpu) +{ + struct migration_req req; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) + || unlikely(!cpu_active(dest_cpu))) + goto out; + + /* force the process onto the specified CPU */ + if (migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread (might exit: take ref). */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); + task_rq_unlock(rq, &flags); + wake_up_process(mt); + put_task_struct(mt); + wait_for_completion(&req.done); + + return; + } +out: + task_rq_unlock(rq, &flags); +} + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. + */ +void sched_exec(void) +{ + int new_cpu, this_cpu = get_cpu(); + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) +{ + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. 
+ */ + check_preempt_curr(this_rq, p, 0); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { + schedstat_inc(p, se.nr_failed_migrations_affine); + return 0; + } + *all_pinned = 0; + + if (task_running(rq, p)) { + schedstat_inc(p, se.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + if (!task_hot(p, rq->clock, sd) || + sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->clock, sd)) { + schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.nr_forced_migrations); + } +#endif + return 1; + } + + if (task_hot(p, rq->clock, sd)) { + schedstat_inc(p, se.nr_failed_migrations_hot); + return 0; + } + return 1; +} + +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct rq_iterator *iterator) +{ + int loops = 0, pulled = 0, pinned = 0; + struct task_struct *p; + long rem_load_move = max_load_move; + + if (max_load_move == 0) + goto out; + + pinned = 1; + + /* + * Start the load-balancing iterator: + */ + p = iterator->start(iterator->arg); +next: + if (!p || loops++ > sysctl_sched_nr_migrate) + goto out; + + if ((p->se.load.weight >> 1) > rem_load_move || + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + p = iterator->next(iterator->arg); + goto next; + } + + pull_task(busiest, p, this_rq, this_cpu); + pulled++; + rem_load_move -= p->se.load.weight; + + /* + * We only want to steal up to the prescribed amount of weighted load. + */ + if (rem_load_move > 0) { + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; + p = iterator->next(iterator->arg); + goto next; + } +out: + /* + * Right now, this is one of only two places pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; + + return max_load_move - rem_load_move; +} + +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. 
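+ *
+ * The scheduling classes are walked from highest to lowest priority and each
+ * class moves what it can of the remaining load budget; a newly idle rq
+ * stops as soon as it has pulled something to run.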
+ */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + const struct sched_class *class = sched_class_highest; + unsigned long total_load_moved = 0; + int this_best_prio = this_rq->curr->prio; + + do { + total_load_moved += + class->load_balance(this_rq, this_cpu, busiest, + max_load_move - total_load_moved, + sd, idle, all_pinned, &this_best_prio); + class = class->next; + + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + + } while (class && max_load_move > total_load_moved); + + return total_load_moved > 0; +} + +static int +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle, + struct rq_iterator *iterator) +{ + struct task_struct *p = iterator->start(iterator->arg); + int pinned = 0; + + while (p) { + if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). + */ + schedstat_inc(sd, lb_gained[idle]); + + return 1; + } + p = iterator->next(iterator->arg); + } + + return 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + const struct sched_class *class; + + for (class = sched_class_highest; class; class = class->next) + if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) + return 1; + + return 0; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the amount of weighted load which + * should be moved to restore balance via the imbalance parameter. 
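+ *
+ * *balance is cleared when this cpu is not the cpu designated to balance at
+ * this domain level, so the caller can bail out without doing any work.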
+ */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, const struct cpumask *cpus, int *balance) +{ + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; + unsigned long busiest_load_per_task, busiest_nr_running; + unsigned long this_load_per_task, this_nr_running; + int load_idx, group_imb = 0; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance = 1; + unsigned long leader_nr_running = 0, min_load_per_task = 0; + unsigned long min_nr_running = ULONG_MAX; + struct sched_group *group_min = NULL, *group_leader = NULL; +#endif + + max_load = this_load = total_load = total_pwr = 0; + busiest_load_per_task = busiest_nr_running = 0; + this_load_per_task = this_nr_running = 0; + + if (idle == CPU_NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == CPU_NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; + + do { + unsigned long load, group_capacity, max_cpu_load, min_cpu_load; + int local_group; + int i; + int __group_imb = 0; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long sum_nr_running, sum_weighted_load; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + if (local_group) + balance_cpu = cpumask_first(sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + sum_weighted_load = sum_nr_running = avg_load = 0; + sum_avg_load_per_task = avg_load_per_task = 0; + + max_cpu_load = 0; + min_cpu_load = ~0UL; + + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); + + if (*sd_idle && rq->nr_running) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } + + avg_load += load; + sum_nr_running += rq->nr_running; + sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + + total_load += avg_load; + total_pwr += group->__cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); + + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? 
+ */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + __group_imb = 1; + + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + + if (local_group) { + this_load = avg_load; + this = group; + this_nr_running = sum_nr_running; + this_load_per_task = sum_weighted_load; + } else if (avg_load > max_load && + (sum_nr_running > group_capacity || __group_imb)) { + max_load = avg_load; + busiest = group; + busiest_nr_running = sum_nr_running; + busiest_load_per_task = sum_weighted_load; + group_imb = __group_imb; + } + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || + !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (this_nr_running >= group_capacity || + !this_nr_running)) + power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!power_savings_balance || sum_nr_running >= group_capacity + || !sum_nr_running) + goto group_next; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_min)))) { + group_min = group; + min_nr_running = sum_nr_running; + min_load_per_task = sum_weighted_load / + sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) { + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_leader)))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } + } +group_next: +#endif + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load || busiest_nr_running == 0) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; + + busiest_load_per_task /= busiest_nr_running; + if (group_imb) + busiest_load_per_task = min(busiest_load_per_task, avg_load); + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. 
+ */ + if (max_load <= busiest_load_per_task) + goto out_balanced; + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (max_load < avg_load) { + *imbalance = 0; + goto small_imbalance; + } + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * busiest->__cpu_power, + (avg_load - this_load) * this->__cpu_power) + / SCHED_LOAD_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < busiest_load_per_task) { + unsigned long tmp, pwr_now, pwr_move; + unsigned int imbn; + +small_imbalance: + pwr_move = pwr_now = 0; + imbn = 2; + if (this_nr_running) { + this_load_per_task /= this_nr_running; + if (busiest_load_per_task > this_load_per_task) + imbn = 1; + } else + this_load_per_task = cpu_avg_load_per_task(this_cpu); + + if (max_load - this_load + busiest_load_per_task >= + busiest_load_per_task * imbn) { + *imbalance = busiest_load_per_task; + return busiest; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->__cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->__cpu_power * + min(this_load_per_task, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = sg_div_cpu_power(busiest, + busiest_load_per_task * SCHED_LOAD_SCALE); + if (max_load > tmp) + pwr_move += busiest->__cpu_power * + min(busiest_load_per_task, max_load - tmp); + + /* Amount of load we'd add */ + if (max_load * busiest->__cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(this, + max_load * busiest->__cpu_power); + else + tmp = sg_div_cpu_power(this, + busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += this->__cpu_power * + min(this_load_per_task, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; + } + + return busiest; + +out_balanced: +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto ret; + + if (this == group_leader && group_leader != group_min) { + *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + cpumask_first(sched_group_cpus(group_leader)); + } + return group_min; + } +#endif +ret: + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. 
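+ *
+ * The queue with the largest weighted load is chosen, except that a queue
+ * whose single runnable task already exceeds the requested imbalance is
+ * skipped, since a lone running task cannot be pulled anyway.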
+ */ +static struct rq * +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, + unsigned long imbalance, const struct cpumask *cpus) +{ + struct rq *busiest = NULL, *rq; + unsigned long max_load = 0; + int i; + + for_each_cpu(i, sched_group_cpus(group)) { + unsigned long wl; + + if (!cpumask_test_cpu(i, cpus)) + continue; + + rq = cpu_rq(i); + wl = weighted_cpuload(i); + + if (rq->nr_running == 1 && wl > imbalance) + continue; + + if (wl > max_load) { + max_load = wl; + busiest = rq; + } + } + + return busiest; +} + +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + */ +static int load_balance(int this_cpu, struct rq *this_rq, + struct sched_domain *sd, enum cpu_idle_type idle, + int *balance, struct cpumask *cpus) +{ + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + struct sched_group *group; + unsigned long imbalance; + struct rq *busiest; + unsigned long flags; + + cpumask_setall(cpus); + + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as CPU_IDLE, instead of + * portraying it as CPU_NOT_IDLE. + */ + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + sd_idle = 1; + + schedstat_inc(sd, lb_count[idle]); + +redo: + update_shares(sd); + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + cpus, balance); + + if (*balance == 0) + goto out_balanced; + + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(group, idle, imbalance, cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + ld_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. ld_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + local_irq_save(flags); + double_rq_lock(this_rq, busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); + + /* + * some other cpu did the load balance for us. 
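+ * Kick this_cpu with resched_cpu() so it notices the tasks just pulled on
+ * its behalf (this is the case when the nohz idle balance owner balances
+ * for an idle cpu).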
+ */ + if (ld_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) { + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) + goto redo; + goto out_balanced; + } + } + + if (!ld_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + + spin_lock_irqsave(&busiest->lock, flags); + + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { + spin_unlock_irqrestore(&busiest->lock, flags); + all_pinned = 1; + goto out_one_pinned; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + spin_unlock_irqrestore(&busiest->lock, flags); + if (active_balance) + wake_up_process(busiest->migration_thread); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries+1; + } + } else + sd->nr_balance_failed = 0; + + if (likely(!active_balance)) { + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + } + + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + ld_moved = -1; + + goto out; + +out_balanced: + schedstat_inc(sd, lb_balanced[idle]); + + sd->nr_balance_failed = 0; + +out_one_pinned: + /* tune up the balancing interval */ + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) + sd->balance_interval *= 2; + + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + ld_moved = -1; + else + ld_moved = 0; +out: + if (ld_moved) + update_shares(sd); + return ld_moved; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). + * this_rq is locked. + */ +static int +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + struct cpumask *cpus) +{ + struct sched_group *group; + struct rq *busiest = NULL; + unsigned long imbalance; + int ld_moved = 0; + int sd_idle = 0; + int all_pinned = 0; + + cpumask_setall(cpus); + + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as CPU_NOT_IDLE. 
+ */ + if (sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + sd_idle = 1; + + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); +redo: + update_shares_locked(this_rq, sd); + group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, + &sd_idle, cpus, NULL); + if (!group) { + schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); + goto out_balanced; + } + + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); + + ld_moved = 0; + if (busiest->nr_running > 1) { + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + /* this_rq->clock is already updated */ + update_rq_clock(busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); + double_unlock_balance(this_rq, busiest); + + if (unlikely(all_pinned)) { + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) + goto redo; + } + } + + if (!ld_moved) { + int active_balance = 0; + + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. 
+ */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + /* + * Should not call ttwu while holding a rq->lock + */ + spin_unlock(&this_rq->lock); + if (active_balance) + wake_up_process(busiest->migration_thread); + spin_lock(&this_rq->lock); + + } else + sd->nr_balance_failed = 0; + + update_shares_locked(this_rq, sd); + return ld_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + return -1; + sd->nr_balance_failed = 0; + + return 0; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static void idle_balance(int this_cpu, struct rq *this_rq) +{ + struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + HZ; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; + + for_each_domain(this_cpu, sd) { + unsigned long interval; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (sd->flags & SD_BALANCE_NEWIDLE) + /* If we've pulled tasks over stop searching: */ + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, tmpmask); + + interval = msecs_to_jiffies(sd->balance_interval); + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + if (pulled_task) + break; + } + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; + } + free_cpumask_var(tmpmask); +} + +/* + * active_load_balance is run by migration threads. It pushes running tasks + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be + * running on each physical CPU where possible, and avoids physical / + * logical imbalances. + * + * Called with busiest_rq locked. + */ +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +{ + int target_cpu = busiest_rq->push_cpu; + struct sched_domain *sd; + struct rq *target_rq; + + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + return; + + target_rq = cpu_rq(target_cpu); + + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + update_rq_clock(busiest_rq); + update_rq_clock(target_rq); + + /* Search for an sd spanning us and the target CPU. 
*/ + for_each_domain(target_cpu, sd) { + if ((sd->flags & SD_LOAD_BALANCE) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + schedstat_inc(sd, alb_count); + + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + double_unlock_balance(busiest_rq, target_rq); +} + +#ifdef CONFIG_NO_HZ +static struct { + atomic_t load_balancer; + cpumask_var_t cpu_mask; +} nohz ____cacheline_aligned = { + .load_balancer = ATOMIC_INIT(-1), +}; + +/* + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. If all the cpus in the system + * go into this tickless mode, then there will be no ilb owner (as there is + * no need for one) and all the cpus will sleep till the next wakeup event + * arrives... + * + * For the ilb owner, tick is not stopped. And this tick will be used + * for idle load balancing. ilb owner will still be part of + * nohz.cpu_mask.. + * + * While stopping the tick, this cpu will become the ilb owner if there + * is no other owner. And will be the owner till that cpu becomes busy + * or if all cpus in the system stop their ticks at which point + * there is no need for ilb owner. + * + * When the ilb owner becomes busy, it nominates another owner, during the + * next busy scheduler_tick() + */ +int select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + cpu_rq(cpu)->in_nohz_recently = 1; + + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return 0; + + /* + * If we are going offline and still the leader, + * give up! + */ + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + + return 0; + } + + cpumask_set_cpu(cpu, nohz.cpu_mask); + + /* time for ilb owner also to sleep */ + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (atomic_read(&nohz.load_balancer) == cpu) + atomic_set(&nohz.load_balancer, -1); + return 0; + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) + return 1; + } else if (atomic_read(&nohz.load_balancer) == cpu) + return 1; + } else { + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) + return 0; + + cpumask_clear_cpu(cpu, nohz.cpu_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + } + return 0; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ +static void rebalance_domains(int cpu, enum cpu_idle_type idle) +{ + int balance = 1; + struct rq *rq = cpu_rq(cpu); + unsigned long interval; + struct sched_domain *sd; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; + + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != CPU_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + if (interval > HZ*NR_CPUS/10) + interval = HZ*NR_CPUS/10; + + need_serialize = sd->flags & SD_SERIALIZE; + + if (need_serialize) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle. + */ + idle = CPU_NOT_IDLE; + } + sd->last_balance = jiffies; + } + if (need_serialize) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) { + next_balance = sd->last_balance + interval; + update_next_balance = 1; + } + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; + } + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; + + free_cpumask_var(tmp); +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_at_tick ? + CPU_IDLE : CPU_NOT_IDLE; + + rebalance_domains(this_cpu, idle); + +#ifdef CONFIG_NO_HZ + /* + * If this cpu is the owner for idle load balancing, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + if (this_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == this_cpu) { + struct rq *rq; + int balance_cpu; + + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) + break; + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + } +#endif +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * + * In case of CONFIG_NO_HZ, this is the place where we nominate a new + * idle load balancing owner or decide to stop the periodic load balancing, + * if the whole system is idle. + */ +static inline void trigger_load_balance(struct rq *rq, int cpu) +{ +#ifdef CONFIG_NO_HZ + /* + * If we were in the nohz mode recently and busy at the current + * scheduler tick, then check if we need to nominate new idle + * load balancer. 
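The per-domain interval computed above is balance_interval in milliseconds, multiplied by busy_factor when the CPU is not idle, converted to jiffies, and clamped to at least one tick and at most HZ*NR_CPUS/10. A small sketch of that arithmetic, with HZ and NR_CPUS as stand-in configuration values and the msecs-to-jiffies conversion reduced to a plain division:

/*
 * Editor's sketch, not part of the patch: the effective rebalance
 * period, with the same scaling and clamping as the code above.
 */
#define HZ       250
#define NR_CPUS  8

static unsigned long effective_interval(unsigned long balance_interval_ms,
                                         unsigned int busy_factor, int cpu_busy)
{
        unsigned long interval = balance_interval_ms;

        if (cpu_busy)                           /* busy CPUs balance less often */
                interval *= busy_factor;

        interval = interval * HZ / 1000;        /* scale ms to jiffies */
        if (interval == 0)
                interval = 1;
        if (interval > HZ * NR_CPUS / 10)       /* global upper bound */
                interval = HZ * NR_CPUS / 10;
        return interval;
}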
+ */ + if (rq->in_nohz_recently && !rq->idle_at_tick) { + rq->in_nohz_recently = 0; + + if (atomic_read(&nohz.load_balancer) == cpu) { + cpumask_clear_cpu(cpu, nohz.cpu_mask); + atomic_set(&nohz.load_balancer, -1); + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* + * simple selection for now: Nominate the + * first cpu in the nohz list to be the next + * ilb owner. + * + * TBD: Traverse the sched domains and nominate + * the nearest cpu in the nohz.cpu_mask. + */ + int ilb = cpumask_first(nohz.cpu_mask); + + if (ilb < nr_cpu_ids) + resched_cpu(ilb); + } + } + + /* + * If this cpu is idle and doing idle load balancing for all the + * cpus with ticks stopped, is it time for that to stop? + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + resched_cpu(cpu); + return; + } + + /* + * If this cpu is idle and the idle load balancing is done by + * someone else, then no need raise the SCHED_SOFTIRQ + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && + cpumask_test_cpu(cpu, nohz.cpu_mask)) + return; +#endif + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); +} + +#else /* CONFIG_SMP */ + +/* + * on UP we do not need to balance between CPUs: + */ +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + */ +unsigned long long task_delta_exec(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + + if (task_current(rq, p)) { + u64 delta_exec; + + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; + if ((s64)delta_exec > 0) + ns = delta_exec; + } + + task_rq_unlock(rq, &flags); + + return ns; +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + /* Add user time to process. */ + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + account_group_user_time(p, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); + /* Account for user time used */ + acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + cputime64_t tmp; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + tmp = cputime_to_cputime64(cputime); + + /* Add guest time to process. */ + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + account_group_user_time(p, cputime); + p->gtime = cputime_add(p->gtime, cputime); + + /* Add guest time to cpustat. 
*/ + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else + cpustat->system = cputime64_add(cpustat->system, tmp); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account for involuntary wait time. + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. 
+ * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +inline cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; + + sched_clock_tick(); + + spin_lock(&rq->lock); + update_rq_clock(rq); + update_cpu_load(rq); + curr->sched_class->task_tick(rq, curr, 0); + spin_unlock(&rq->lock); + +#ifdef CONFIG_SMP + rq->idle_at_tick = idle_cpu(cpu); + trigger_load_balance(rq, cpu); +#endif +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} + +void __kprobes add_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; +#endif + preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? + */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +} +EXPORT_SYMBOL(add_preempt_count); + +void __kprobes sub_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; + /* + * Is the spinlock portion underflowing? 
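task_utime()/task_stime() above split the precise CFS runtime (se.sum_exec_runtime) between user and system time in the same ratio as the tick-sampled utime/stime, deriving stime as the remainder so the total reported to userspace stays consistent. A simplified sketch of that split in plain 64-bit arithmetic (the clock_t conversions and the prev_utime/prev_stime monotonicity clamps are omitted):

#include <stdint.h>

/*
 * Editor's sketch, not part of the patch: divide the precise runtime
 * in the proportion given by the sampled utime/stime.
 */
static uint64_t scaled_utime(uint64_t sum_exec_runtime,
                             uint64_t utime, uint64_t stime)
{
        uint64_t total = utime + stime;

        if (!total)                     /* no samples yet: all user time,
                                           as the original does */
                return sum_exec_runtime;
        return sum_exec_runtime * utime / total;
}

static uint64_t scaled_stime(uint64_t sum_exec_runtime,
                             uint64_t utime, uint64_t stime)
{
        /* system time is the remainder, keeping the sum consistent */
        return sum_exec_runtime - scaled_utime(sum_exec_runtime, utime, stime);
}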
+ */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ + struct pt_regs *regs = get_irq_regs(); + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + + debug_show_held_locks(prev); + print_modules(); + if (irqs_disabled()) + print_irqtrace_events(prev); + + if (regs) + show_regs(regs); + else + dump_stack(); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + __schedule_bug(prev); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + + schedstat_inc(this_rq(), sched_count); +#ifdef CONFIG_SCHEDSTATS + if (unlikely(prev->lock_depth >= 0)) { + schedstat_inc(this_rq(), bkl_count); + schedstat_inc(prev, sched_info.bkl_count); + } +#endif +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev) +{ + const struct sched_class *class; + struct task_struct *p; + + /* + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: + */ + if (likely(rq->nr_running == rq->cfs.nr_running)) { + p = fair_sched_class.pick_next_task(rq); + if (likely(p)) + return p; + } + + class = sched_class_highest; + for ( ; ; ) { + p = class->pick_next_task(rq); + if (p) + return p; + /* + * Will never be NULL as the idle class always + * returns a non-NULL p: + */ + class = class->next; + } +} + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + struct task_struct *prev, *next; + unsigned long *switch_count; + struct rq *rq; + int cpu; + +need_resched: + preempt_disable(); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_qsctr_inc(cpu); + prev = rq->curr; + switch_count = &prev->nivcsw; + + release_kernel_lock(prev); +need_resched_nonpreemptible: + + schedule_debug(prev); + + if (sched_feat(HRTICK)) + hrtick_clear(rq); + + spin_lock_irq(&rq->lock); + update_rq_clock(rq); + clear_tsk_need_resched(prev); + + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (unlikely(signal_pending_state(prev->state, prev))) + prev->state = TASK_RUNNING; + else + deactivate_task(rq, prev, 1); + switch_count = &prev->nvcsw; + } + +#ifdef CONFIG_SMP + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +#endif + + if (unlikely(!rq->nr_running)) + idle_balance(cpu, rq); + + prev->sched_class->put_prev_task(rq, prev); + next = pick_next_task(rq, prev); + + if (likely(prev != next)) { + sched_info_switch(prev, next); + + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under + * us, hence refresh the local variables. 
+ */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + } else + spin_unlock_irq(&rq->lock); + + if (unlikely(reacquire_kernel_lock(current) < 0)) + goto need_resched_nonpreemptible; + + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); + + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (likely(ti->preempt_count || irqs_disabled())) + return; + + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +} +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); + + /* Catch callers which need to be fixed */ + BUG_ON(ti->preempt_count || !irqs_disabled()); + + do { + add_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); + schedule(); + local_irq_disable(); + sub_preempt_count(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); +} + +#endif /* CONFIG_PREEMPT */ +#endif /* !DDE_LINUX */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) +{ + return try_to_wake_up(curr->private, mode, sync); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, sync, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. 
+ * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} + +/** + * __wake_up_sync - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + */ +void +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + int sync = 1; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + sync = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. 
+ */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ +unsigned long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. 
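The complete()/wait_for_completion() pair implemented above is the usual way for one context to block until another announces it is done. A minimal kernel-module-style sketch of that usage, assuming a helper kthread; the names are illustrative:

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/err.h>

/*
 * Editor's sketch, not part of the patch: one context announces
 * "done" with complete(), another blocks in wait_for_completion()
 * until then.
 */
static struct completion setup_done;

static int helper_thread(void *data)
{
        /* ... perform the setup work ... */
        complete(&setup_done);          /* wake exactly one waiter */
        return 0;
}

static int start_and_wait(void)
{
        struct task_struct *tsk;

        init_completion(&setup_done);
        tsk = kthread_run(helper_thread, NULL, "helper");
        if (IS_ERR(tsk))
                return PTR_ERR(tsk);

        wait_for_completion(&setup_done);   /* sleeps until complete() */
        return 0;
}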
It can be + * interrupted by a kill signal. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) +{ + unsigned long flags; + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); + + __set_current_state(state); + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, &wait); + spin_unlock(&q->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&q->lock); + __remove_wait_queue(q, &wait); + spin_unlock_irqrestore(&q->lock, flags); + + return timeout; +} + +#ifndef DDE_LINUX +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(interruptible_sleep_on); + +long __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void __sched sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(sleep_on); + +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(sleep_on_timeout); + +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. 
+ */ +void rt_mutex_setprio(struct task_struct *p, int prio) +{ + unsigned long flags; + int oldprio, on_rq, running; + struct rq *rq; + const struct sched_class *prev_class = p->sched_class; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + + oldprio = p->prio; + on_rq = p->se.on_rq; + running = task_current(rq, p); + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + p->prio = prio; + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { + enqueue_task(rq, p, 0); + + check_class_changed(rq, p, prev_class, oldprio, running); + } + task_rq_unlock(rq, &flags); +} + +#endif + +void set_user_nice(struct task_struct *p, long nice) +{ + int old_prio, delta, on_rq; + unsigned long flags; + struct rq *rq; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * SCHED_FIFO/SCHED_RR: + */ + if (task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + on_rq = p->se.on_rq; + if (on_rq) + dequeue_task(rq, p, 0); + + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; + + if (on_rq) { + enqueue_task(rq, p, 0); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; + + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + if (increment < -40) + increment = -40; + if (increment > 40) + increment = 40; + + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. 
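sys_nice() above clamps the increment to [-40, 40] and the resulting nice value to [-20, 19], while can_nice() compares against RLIMIT_NICE after converting the nice value to the rlimit scale as 20 - nice. The same arithmetic as a small stand-alone sketch:

/*
 * Editor's sketch, not part of the patch: the clamping done by
 * sys_nice() and the nice-to-rlimit conversion used by can_nice()
 * (nice 19..-20 maps to rlimit-style 1..40).
 */
static int clamp_nice(int current_nice, int increment)
{
        int nice;

        if (increment < -40)
                increment = -40;
        if (increment > 40)
                increment = 40;

        nice = current_nice + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
                nice = 19;
        return nice;
}

static int nice_to_rlimit_style(int nice)
{
        return 20 - nice;       /* e.g. nice -20 -> 40, nice 19 -> 1 */
}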
+ */ +int task_prio(const struct task_struct *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(const struct task_struct *p) +{ + return TASK_NICE(p); +} +EXPORT_SYMBOL(task_nice); + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. */ +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +{ + BUG_ON(p->se.on_rq); + + p->policy = policy; + switch (p->policy) { + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + p->sched_class = &fair_sched_class; + break; + case SCHED_FIFO: + case SCHED_RR: + p->sched_class = &rt_sched_class; + break; + } + + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + set_load_weight(p); +} +#endif + +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + +static int __sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param, bool user) +{ +#ifndef DDE_LINUX + int retval, oldprio, oldpolicy = -1, on_rq, running; + unsigned long flags; + const struct sched_class *prev_class = p->sched_class; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. 
+ */ + if (param->sched_priority < 0 || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + return -EINVAL; + if (rt_policy(policy) != (param->sched_priority != 0)) + return -EINVAL; + + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (user && !capable(CAP_SYS_NICE)) { + if (rt_policy(policy)) { + unsigned long rlim_rtprio; + + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &flags); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } + /* + * Like positive nice levels, dont allow tasks to + * move out of SCHED_IDLE either: + */ + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) + return -EPERM; + + /* can't change other user's priorities */ + if (!check_same_owner(p)) + return -EPERM; + } + + if (user) { +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; +#endif + + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock_irqsave(&p->pi_lock, flags); + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. + */ + rq = __task_rq_lock(p); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } + update_rq_clock(rq); + on_rq = p->se.on_rq; + running = task_current(rq, p); + if (on_rq) + deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + oldprio = p->prio; + __setscheduler(rq, p, policy, param->sched_priority); + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { + activate_task(rq, p, 0); + + check_class_changed(rq, p, prev_class, oldprio, running); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + + return 0; +#else + //WARN_UNIMPL; + return 0; +#endif +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +#ifndef DDE_LINUX + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. 
+ */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lparam; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); + rcu_read_unlock(); + + return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, + struct sched_param __user *, param) +{ + /* negative values for policy are not valid */ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) +{ + return do_sched_setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +{ + struct task_struct *p; + int retval; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + return retval; +} + +/** + * sys_sched_getscheduler - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +{ + struct sched_param lp; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + cpumask_var_t cpus_allowed, new_mask; + struct task_struct *p; + int retval; + + get_online_cpus(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + put_online_cpus(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. 
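The sched_setscheduler/sched_getscheduler/sched_setparam/sched_getparam syscalls defined above are normally reached through the POSIX wrappers. A minimal userspace example switching the caller to SCHED_FIFO (it needs CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO, per the permission checks above):

#include <sched.h>
#include <stdio.h>

/*
 * Editor's sketch, not part of the patch: userspace view of the
 * scheduler policy syscalls via the POSIX wrappers.
 */
int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy is now %d\n", sched_getscheduler(0));
        return 0;
}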
+ */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + retval = -EPERM; + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); + again: + retval = set_cpus_allowed_ptr(p, new_mask); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + } +out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: + put_task_struct(p); + put_online_cpus(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + struct cpumask *new_mask) +{ + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, struct cpumask *mask) +{ + struct task_struct *p; + int retval; + + get_online_cpus(); + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + +out_unlock: + read_unlock(&tasklist_lock); + put_online_cpus(); + + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if (len < cpumask_size()) + return -EINVAL; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); + + return ret; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. 
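The affinity syscalls above have a direct userspace counterpart in the glibc cpu_set_t interface. A small example pinning the calling process to CPU 0 and reading the mask back:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/*
 * Editor's sketch, not part of the patch: userspace counterpart of
 * sched_setaffinity()/sched_getaffinity().
 */
int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);               /* allow CPU 0 only */
        if (sched_setaffinity(0, sizeof(set), &set) != 0) {
                perror("sched_setaffinity");
                return 1;
        }

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) != 0) {
                perror("sched_getaffinity");
                return 1;
        }
        printf("CPU 0 in mask: %s\n", CPU_ISSET(0, &set) ? "yes" : "no");
        return 0;
}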
+ */ +SYSCALL_DEFINE0(sched_yield) +{ + struct rq *rq = this_rq_lock(); + + schedstat_inc(rq, yld_count); + current->sched_class->yield_task(rq); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} +#endif /* !DDE_LINUX */ + +static void __cond_resched(void) +{ +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif + /* + * The BKS might be reacquired before we have dropped + * PREEMPT_ACTIVE, which could trigger a second + * cond_resched() call. + */ + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + +int __sched _cond_resched(void) +{ + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && + system_state == SYSTEM_RUNNING) { + __cond_resched(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(_cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +int cond_resched_lock(spinlock_t *lock) +{ + int resched = need_resched() && system_state == SYSTEM_RUNNING; + int ret = 0; + + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (resched && need_resched()) + __cond_resched(); + else + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(cond_resched_lock); + +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched() && system_state == SYSTEM_RUNNING) { + local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq); + +#ifndef DDE_LINUX +/** + * yield - yield the current processor to other threads. + * + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +void __sched io_schedule(void) +{ + struct rq *rq = &__raw_get_cpu_var(runqueues); + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); +} +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ + struct rq *rq = &__raw_get_cpu_var(runqueues); + long ret; + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. 
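The sched_get_priority_max()/sched_get_priority_min() syscalls that follow report the valid static priority range for a policy; userspace typically queries them before choosing a SCHED_FIFO/SCHED_RR priority. A minimal example through the POSIX wrappers:

#include <sched.h>
#include <stdio.h>

/*
 * Editor's sketch, not part of the patch: query the priority range
 * for SCHED_FIFO before picking a real-time priority.
 */
int main(void)
{
        int lo = sched_get_priority_min(SCHED_FIFO);
        int hi = sched_get_priority_max(SCHED_FIFO);

        if (lo == -1 || hi == -1) {
                perror("sched_get_priority_*");
                return 1;
        }
        printf("SCHED_FIFO priorities: %d..%d\n", lo, hi);
        return 0;
}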
+ */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct task_struct *p; + unsigned int time_slice; + int retval; + struct timespec t; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + /* + * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER + * tasks that are on an otherwise idle runqueue: + */ + time_slice = 0; + if (p->policy == SCHED_RR) { + time_slice = DEF_TIMESLICE; + } else if (p->policy != SCHED_FIFO) { + struct sched_entity *se = &p->se; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + if (rq->cfs.load.weight) + time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + task_rq_unlock(rq, &flags); + } + read_unlock(&tasklist_lock); + jiffies_to_timespec(time_slice, &t); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + +void sched_show_task(struct task_struct *p) +{ + unsigned long free = 0; + unsigned state; + + state = p->state ? __ffs(p->state) + 1 : 0; + printk(KERN_INFO "%-13.13s %c", p->comm, + state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long *n = end_of_stack(p); + while (!*n) + n++; + free = (unsigned long)n - (unsigned long)end_of_stack(p); + } +#endif + printk(KERN_CONT "%5lu %5d %6d\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent)); + + show_stack(p, NULL); +} + +void show_state_filter(unsigned long state_filter) +{ + struct task_struct *g, *p; + +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); +#else + printk(KERN_INFO + " task PC stack pid father\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); + } while_each_thread(g, p); + + touch_all_softlockup_watchdogs(); + +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif + read_unlock(&tasklist_lock); + /* + * Only show locks if all tasks are dumped: + */ + if (state_filter == -1) + debug_show_all_locks(); +} + +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ + idle->sched_class = &idle_sched_class; +} + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ +void __cpuinit init_idle(struct task_struct *idle, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + + __sched_fork(idle); + idle->se.exec_start = sched_clock(); + + idle->prio = idle->normal_prio = MAX_PRIO; + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + __set_task_cpu(idle, cpu); + + rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); +#else + task_thread_info(idle)->preempt_count = 0; +#endif + /* + * The idle tasks have their own, simple scheduling class: + */ + idle->sched_class = &idle_sched_class; + ftrace_graph_init_task(idle); +} +#endif /* DDE_LINUX */ + +/* + * In a system that switches off the HZ timer nohz_cpu_mask + * indicates which cpus entered this state. This is used + * in the rcu update to wait only for active cpus. For system + * which do not switch off the HZ timer nohz_cpu_mask should + * always be CPU_BITS_NONE. + */ +cpumask_var_t nohz_cpu_mask; + +#ifndef DDE_LINUX +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. 
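As a worked example of the scaling described above: the factor is 1 plus the log2 of the online CPU count, so 1 CPU gives 1, 4 CPUs give 3, and 64 CPUs give 7; the function that follows multiplies the granularity sysctls by this factor and clamps them to the 200000000 limit. A small sketch of the factor computation:

/*
 * Editor's sketch, not part of the patch: factor = 1 + ilog2(ncpus),
 * computed with a plain loop instead of the kernel's ilog2().
 */
static unsigned int granularity_factor(unsigned int ncpus)
{
        unsigned int factor = 1;

        while (ncpus > 1) {             /* integer log2 */
                ncpus >>= 1;
                factor++;
        }
        return factor;
}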
+ * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long limit = 200000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; + + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; +} + +#ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we queue a struct migration_req structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + struct migration_req req; + unsigned long flags; + struct rq *rq; + int ret = 0; + + rq = task_rq_lock(p, &flags); + if (!cpumask_intersects(new_mask, cpu_online_mask)) { + ret = -EINVAL; + goto out; + } + + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpumask_equal(&p->cpus_allowed, new_mask))) { + ret = -EINVAL; + goto out; + } + + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + else { + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + } + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. + */ +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + struct rq *rq_dest, *rq_src; + int ret = 0, on_rq; + + if (unlikely(!cpu_active(dest_cpu))) + return ret; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto done; + /* Affinity changed (again). 
*/ + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto fail; + + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq_src, p, 0); + + set_task_cpu(p, dest_cpu); + if (on_rq) { + activate_task(rq_dest, p, 0); + check_preempt_curr(rq_dest, p, 0); + } +done: + ret = 1; +fail: + double_rq_unlock(rq_src, rq_dest); + return ret; +} + +/* + * migration_thread - this is a highprio system thread that performs + * thread migration by bumping thread off CPU then 'pushing' onto + * another runqueue. + */ +static int migration_thread(void *data) +{ + int cpu = (long)data; + struct rq *rq; + + rq = cpu_rq(cpu); + BUG_ON(rq->migration_thread != current); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + struct migration_req *req; + struct list_head *head; + + spin_lock_irq(&rq->lock); + + if (cpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + + head = &rq->migration_queue; + + if (list_empty(head)) { + spin_unlock_irq(&rq->lock); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + continue; + } + req = list_entry(head->next, struct migration_req, list); + list_del_init(head->next); + + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); + + complete(&req->done); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) +{ + int ret; + + local_irq_disable(); + ret = __migrate_task(p, src_cpu, dest_cpu); + local_irq_enable(); + return ret; +} + +/* + * Figure out where task on dead CPU should go, use force if necessary. + */ +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); + +again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); + } + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; +} + +/* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because + * for performance reasons the counter is not stricly tracking tasks to + * their home CPUs. 
So we just add the counter to another CPU's counter, + * to keep the global sum constant after CPU-down: + */ +static void migrate_nr_uninterruptible(struct rq *rq_src) +{ + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; + rq_src->nr_uninterruptible = 0; + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); +} + +/* Run through task list and migrate tasks from the dead cpu. */ +static void migrate_live_tasks(int src_cpu) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + + do_each_thread(t, p) { + if (p == current) + continue; + + if (task_cpu(p) == src_cpu) + move_task_off_dead_cpu(src_cpu, p); + } while_each_thread(t, p); + + read_unlock(&tasklist_lock); +} + +/* + * Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible. + * Used by CPU offline code. + */ +void sched_idle_next(void) +{ + int this_cpu = smp_processor_id(); + struct rq *rq = cpu_rq(this_cpu); + struct task_struct *p = rq->idle; + unsigned long flags; + + /* cpu has to be offline */ + BUG_ON(cpu_online(this_cpu)); + + /* + * Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on the current cpu. + */ + spin_lock_irqsave(&rq->lock, flags); + + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + + update_rq_clock(rq); + activate_task(rq, p, 0); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +/* + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + +/* called under rq->lock with disabled interrupts */ +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +{ + struct rq *rq = cpu_rq(dead_cpu); + + /* Must be exiting, otherwise would be on tasklist. */ + BUG_ON(!p->exit_state); + + /* Cannot have done final schedule yet: would have vanished. */ + BUG_ON(p->state == TASK_DEAD); + + get_task_struct(p); + + /* + * Drop lock around migration; if someone else moves it, + * that's OK. No task can be added to this CPU, so iteration is + * fine. + */ + spin_unlock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, p); + spin_lock_irq(&rq->lock); + + put_task_struct(p); +} + +/* release_task() removes task from tasklist, so we won't find dead tasks. 
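+ * Any dead task still queued on the runqueue therefore has to be drained
+ * here, by repeatedly picking the next runnable task and moving it away.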
*/ +static void migrate_dead_tasks(unsigned int dead_cpu) +{ + struct rq *rq = cpu_rq(dead_cpu); + struct task_struct *next; + + for ( ; ; ) { + if (!rq->nr_running) + break; + update_rq_clock(rq); + next = pick_next_task(rq, rq->curr); + if (!next) + break; + next->sched_class->put_prev_task(rq, next); + migrate_dead(dead_cpu, next); + + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {0, }, +}; + +static struct ctl_table sd_ctl_root[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {0, }, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(13); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, 
"domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_online_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +static void unregister_sched_domain_sysctl(void) +{ + if (sd_sysctl_header) + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#else +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) +{ +} +#endif + +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpumask_set_cpu(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpumask_clear_cpu(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int __cpuinit +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + struct task_struct *p; + int cpu = (long)hcpu; + unsigned long flags; + struct rq *rq; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + kthread_bind(p, cpu); + /* Must be high prio: stop_machine expects to yield to it. */ + rq = task_rq_lock(p, &flags); + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); + cpu_rq(cpu)->migration_thread = p; + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + /* Strictly unnecessary, as first user will wake it. */ + wake_up_process(cpu_rq(cpu)->migration_thread); + + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + + set_rq_online(rq); + } + spin_unlock_irqrestore(&rq->lock, flags); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!cpu_rq(cpu)->migration_thread) + break; + /* Unbind it from offline cpu so it can run. Fall thru. 
*/ + kthread_bind(cpu_rq(cpu)->migration_thread, + cpumask_any(cpu_online_mask)); + kthread_stop(cpu_rq(cpu)->migration_thread); + cpu_rq(cpu)->migration_thread = NULL; + break; + + case CPU_DEAD: + case CPU_DEAD_FROZEN: + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + kthread_stop(rq->migration_thread); + rq->migration_thread = NULL; + /* Idle task back to normal (off runqueue, low prio) */ + spin_lock_irq(&rq->lock); + update_rq_clock(rq); + deactivate_task(rq, rq->idle, 0); + rq->idle->static_prio = MAX_PRIO; + __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); + rq->idle->sched_class = &idle_sched_class; + migrate_dead_tasks(cpu); + spin_unlock_irq(&rq->lock); + cpuset_unlock(); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); + + /* + * No need to migrate the tasks: it was best-effort if + * they didn't take sched_hotcpu_mutex. Just wake up + * the requestors. + */ + spin_lock_irq(&rq->lock); + while (!list_empty(&rq->migration_queue)) { + struct migration_req *req; + + req = list_entry(rq->migration_queue.next, + struct migration_req, list); + list_del_init(&req->list); + spin_unlock_irq(&rq->lock); + complete(&req->done); + spin_lock_irq(&rq->lock); + } + spin_unlock_irq(&rq->lock); + break; + + case CPU_DYING: + case CPU_DYING_FROZEN: + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + spin_unlock_irqrestore(&rq->lock, flags); + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __cpuinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = 10 +}; + +static int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + /* Start one for the boot CPU: */ + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + + return err; +} +early_initcall(migration_init); +#endif + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SCHED_DEBUG + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + struct sched_group *group = sd->groups; + char str[256]; + + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %s level %s\n", str, sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + + printk(KERN_DEBUG "%*s groups:", level + 1, ""); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!group->__cpu_power) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); + break; + } + + if (!cpumask_weight(sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + 
break; + } + + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } + + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + printk(KERN_CONT " %s", str); + + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + cpumask_var_t groupmask; + int level = 0; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level, groupmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } + free_cpumask_var(groupmask); +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_IDLE | + SD_WAKE_AFFINE | + SD_WAKE_BALANCE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + /* Does parent contain flags not in child? 
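+ * If so, the parent still performs balancing work the child does not,
+ * and must not be collapsed.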
*/ + /* WAKE_BALANCE is a subset of WAKE_AFFINE */ + if (cflags & SD_WAKE_AFFINE) + pflags &= ~SD_WAKE_BALANCE; + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; + } + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct root_domain *rd) +{ + cpupri_cleanup(&rd->cpupri); + + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * If we dont want to free the old_rt yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + set_rq_online(rq); + + spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + free_rootdomain(old_rd); +} + +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) +{ + memset(rd, 0, sizeof(*rd)); + + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri, true); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; + return 0; + +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain, true); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. 
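+ * Walk towards the root and splice out every parent whose span and flags
+ * add nothing beyond its child (see sd_parent_degenerate() above).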
*/ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + sd = sd->parent; + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + rcu_assign_pointer(rq->sd, sd); +} + +/* cpus with isolated domains */ +static cpumask_var_t cpu_isolated_map; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + cpulist_parse(str, cpu_isolated_map); + return 1; +} + +__setup("isolcpus=", isolated_cpu_setup); + +/* + * init_sched_build_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. + */ +static void +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) +{ + struct sched_group *first = NULL, *last = NULL; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct sched_group *sg; + int group = group_fn(i, cpu_map, &sg, tmpmask); + int j; + + if (cpumask_test_cpu(i, covered)) + continue; + + cpumask_clear(sched_group_cpus(sg)); + sg->__cpu_power = 0; + + for_each_cpu(j, span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) + continue; + + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} + +#define SD_NODES_PER_DOMAIN 16 + +#ifdef CONFIG_NUMA + +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, nodemask_t *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < nr_node_ids; i++) { + /* Start at @node */ + n = (node + i) % nr_node_ids; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (node_isset(n, *used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + node_set(best_node, *used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @span: resulting cpumask + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. 
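+ * The span starts with the node's own CPUs and then greedily adds the CPUs
+ * of up to SD_NODES_PER_DOMAIN - 1 of the closest remaining nodes, as picked
+ * by find_next_best_node().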
+ */ +static void sched_domain_node_span(int node, struct cpumask *span) +{ + nodemask_t used_nodes; + int i; + + cpumask_clear(span); + nodes_clear(used_nodes); + + cpumask_or(span, span, cpumask_of_node(node)); + node_set(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, &used_nodes); + + cpumask_or(span, span, cpumask_of_node(next_node)); + } +} +#endif /* CONFIG_NUMA */ + +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; + +/* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. + */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + +/* + * SMT sched-domains: + */ +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); + +static int +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) +{ + if (sg) + *sg = &per_cpu(sched_group_cpus, cpu).sg; + return cpu; +} +#endif /* CONFIG_SCHED_SMT */ + +/* + * multi-core sched-domains: + */ +#ifdef CONFIG_SCHED_MC +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); +#endif /* CONFIG_SCHED_MC */ + +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) +static int +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) +{ + int group; + + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group).sg; + return group; +} +#elif defined(CONFIG_SCHED_MC) +static int +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) +{ + if (sg) + *sg = &per_cpu(sched_group_core, cpu).sg; + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); + +static int +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) +{ + int group; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); +#else + group = cpu; +#endif + if (sg) + *sg = &per_cpu(sched_group_phys, group).sg; + return group; +} + +#ifdef CONFIG_NUMA +/* + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. 
+ */ +static DEFINE_PER_CPU(struct static_sched_domain, node_domains); +static struct sched_group ***sched_group_nodes_bycpu; + +static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); + +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) +{ + int group; + + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); + group = cpumask_first(nodemask); + + if (sg) + *sg = &per_cpu(sched_group_allnodes, group).sg; + return group; +} + +static void init_numa_sched_groups_power(struct sched_group *group_head) +{ + struct sched_group *sg = group_head; + int j; + + if (!sg) + return; + do { + for_each_cpu(j, sched_group_cpus(sg)) { + struct sched_domain *sd; + + sd = &per_cpu(phys_domains, j).sd; + if (j != cpumask_first(sched_group_cpus(sd->groups))) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + + sg_inc_cpu_power(sg, sd->groups->__cpu_power); + } + sg = sg->next; + } while (sg != group_head); +} +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_NUMA +/* Free memory allocated for various sched_group structures */ +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) +{ + int cpu, i; + + for_each_cpu(cpu, cpu_map) { + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (!sched_group_nodes) + continue; + + for (i = 0; i < nr_node_ids; i++) { + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +} +#else /* !CONFIG_NUMA */ +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) +{ +} +#endif /* CONFIG_NUMA */ + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + * + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents + * the maximum number of tasks a group can handle in the presence of other idle + * or lightly loaded groups in the same sched domain. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_domain *child; + struct sched_group *group; + + WARN_ON(!sd || !sd->groups); + + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + return; + + child = sd->child; + + sd->groups->__cpu_power = 0; + + /* + * For perf policy, if the groups in child domain share resources + * (for example cores sharing some portions of the cache hierarchy + * or SMT), then set this domain groups cpu_power such that each group + * can handle only one task, when there are other idle groups in the + * same sched domain. 
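+ * For example, with two SMT siblings per core and power savings disabled,
+ * each core-level group is credited with SCHED_LOAD_SCALE (one task's worth)
+ * rather than the sum of its sibling groups, so the balancer prefers to
+ * spread tasks one per core first.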
+ */ + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && + (child->flags & + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); + return; + } + + /* + * add cpu_power of each child group to this groups cpu_power + */ + group = child->groups; + do { + sg_inc_cpu_power(sd->groups, group->__cpu_power); + group = group->next; + } while (group != child->groups); +} + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + +#define SD_INIT(sd, type) sd_init_##type(sd) + +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +static int default_relax_domain_level = -1; + +static int __init setup_relax_domain_level(char *str) +{ + unsigned long val; + + val = simple_strtoul(str, NULL, 0); + if (val < SD_LV_MAX) + default_relax_domain_level = val; + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + } +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int __build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) +{ + int i, err = -ENOMEM; + struct root_domain *rd; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; +#ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; + struct sched_group **sched_group_nodes = NULL; + int sd_allnodes = 0; + + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA + /* + * Allocate the per-node list of sched groups + */ + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), + GFP_KERNEL); + if (!sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + goto free_tmpmask; + } +#endif + + rd = alloc_rootdomain(); + if (!rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + goto free_sched_groups; + } + +#ifdef CONFIG_NUMA + 
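+	/* remember the per-node group lists so free_sched_groups() can release them */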
sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; +#endif + + /* + * Set up domains for cpus specified by the cpu_map. + */ + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = NULL, *p; + + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); + +#ifdef CONFIG_NUMA + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); + p = sd; + sd_allnodes = 1; + } else + p = NULL; + + sd = &per_cpu(node_domains, i).sd; + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + sd->parent = p; + if (p) + p->child = sd; + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); +#endif + + p = sd; + sd = &per_cpu(phys_domains, i).sd; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), nodemask); + sd->parent = p; + if (p) + p->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); + +#ifdef CONFIG_SCHED_MC + p = sd; + sd = &per_cpu(core_domains, i).sd; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, + cpu_coregroup_mask(i)); + sd->parent = p; + p->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); +#endif + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i).sd; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); + sd->parent = p; + p->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) + continue; + + init_sched_build_groups(this_sibling_map, cpu_map, + &cpu_to_cpu_group, + send_covered, tmpmask); + } +#endif + +#ifdef CONFIG_SCHED_MC + /* Set up multi-core groups */ + for_each_cpu(i, cpu_map) { + cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); + if (i != cpumask_first(this_core_map)) + continue; + + init_sched_build_groups(this_core_map, cpu_map, + &cpu_to_core_group, + send_covered, tmpmask); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < nr_node_ids; i++) { + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) + continue; + + init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); + } + +#ifdef CONFIG_NUMA + /* Set up node groups */ + if (sd_allnodes) { + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } + + for (i = 0; i < nr_node_ids; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + int j; + + cpumask_clear(covered); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(nodemask)) { + sched_group_nodes[i] = NULL; + continue; + } + + sched_domain_node_span(i, domainspan); + cpumask_and(domainspan, domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for " + "node %d\n", i); + goto error; + } + sched_group_nodes[i] = sg; + for_each_cpu(j, nodemask) { + struct sched_domain *sd; + + sd = &per_cpu(node_domains, j).sd; + 
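+			/* every CPU in this node points its NODE-level domain at the node's group */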
sd->groups = sg; + } + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), nodemask); + sg->next = sg; + cpumask_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < nr_node_ids; j++) { + int n = (i + j) % nr_node_ids; + + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) + break; + + cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); + if (cpumask_empty(tmpmask)) + continue; + + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), + GFP_KERNEL, i); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + goto error; + } + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), tmpmask); + sg->next = prev->next; + cpumask_or(covered, covered, tmpmask); + prev->next = sg; + prev = sg; + } + } +#endif + + /* Calculate CPU power for physical packages and nodes */ +#ifdef CONFIG_SCHED_SMT + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; + + init_sched_groups_power(i, sd); + } +#endif +#ifdef CONFIG_SCHED_MC + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(core_domains, i).sd; + + init_sched_groups_power(i, sd); + } +#endif + + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; + + init_sched_groups_power(i, sd); + } + +#ifdef CONFIG_NUMA + for (i = 0; i < nr_node_ids; i++) + init_numa_sched_groups_power(sched_group_nodes[i]); + + if (sd_allnodes) { + struct sched_group *sg; + + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, + tmpmask); + init_numa_sched_groups_power(sg); + } +#endif + + /* Attach the domains */ + for_each_cpu(i, cpu_map) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i).sd; +#elif defined(CONFIG_SCHED_MC) + sd = &per_cpu(core_domains, i).sd; +#else + sd = &per_cpu(phys_domains, i).sd; +#endif + cpu_attach_domain(sd, rd, i); + } + + err = 0; + +free_tmpmask: + free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; + +#ifdef CONFIG_NUMA +error: + free_sched_groups(cpu_map, tmpmask); + free_rootdomain(rd); + goto free_tmpmask; +#endif +} + +static int build_sched_domains(const struct cpumask *cpu_map) +{ + return __build_sched_domains(cpu_map, NULL); +} + +static struct cpumask *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) +{ + return 0; +} + +/* + * Set up scheduler domains and groups. 
Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ +static int arch_init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + if (!doms_cur) + doms_cur = fallback_doms; + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + dattr_cur = NULL; + err = build_sched_domains(doms_cur); + register_sched_domain_sysctl(); + + return err; +} + +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) +{ + free_sched_groups(cpu_map, tmpmask); +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); + int i; + + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + synchronize_sched(); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be kmalloc'd. This routine takes + * ownership of it and will kfree it when done with it. If the caller + * failed the kmalloc call, then it can pass in doms_new == NULL && + * ndoms_new == 1, and partition_sched_domains() will fallback to + * the single partition 'fallback_doms', it also forces the domains + * to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + + n = doms_new ? 
ndoms_new : 0; + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(&doms_cur[i], &doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur + i); +match1: + ; + } + + if (doms_new == NULL) { + ndoms_cur = 0; + doms_new = fallback_doms; + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { + if (cpumask_equal(&doms_new[i], &doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* no match - add a new doms_new */ + __build_sched_domains(doms_new + i, + dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != fallback_doms) + kfree(doms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +static void arch_reinit_sched_domains(void) +{ + get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + + rebuild_sched_domains(); + put_online_cpus(); +} + +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ + unsigned int level = 0; + + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? + */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) + return -EINVAL; + + if (smt) + sched_smt_power_savings = level; + else + sched_mc_power_savings = level; + + arch_reinit_sched_domains(); + + return count; +} + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) +{ + return sprintf(page, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, + const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 0); +} +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) +{ + return sprintf(page, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, + const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 1); +} +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + +#ifndef CONFIG_CPUSETS +/* + * Add online and remove offline CPUs from the scheduler domains. 
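+ * Each hotplug event simply rebuilds the single default partition from the
+ * CPUs that are currently online and not isolated.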
+ * When cpusets are enabled they take over this function. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + partition_sched_domains(1, NULL, NULL); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + +void __init sched_init_smp(void) +{ + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif + get_online_cpus(); + mutex_lock(&sched_domains_mutex); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); + put_online_cpus(); + +#ifndef CONFIG_CPUSETS + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); +#endif + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + + init_hrtick(); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + BUG(); + sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); +} +#else +void __init sched_init_smp(void) +{ + sched_init_granularity(); +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->rq = rq; +#endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +} + +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + rt_rq->highest_prio = MAX_RT_PRIO; +#endif +#ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; +#endif +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + tg->cfs_rq[cpu] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + if (add) + 
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[cpu] = se; + /* se could be NULL for init_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + + se->my_q = cfs_rq; + se->load.weight = tg->shares; + se->load.inv_weight = 0; + se->parent = parent; +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + tg->rt_rq[cpu] = rt_rq; + init_rt_rq(rt_rq, rq); + rt_rq->tg = tg; + rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + if (add) + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + + tg->rt_se[cpu] = rt_se; + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + + rt_se->my_q = rt_rq; + rt_se->parent = parent; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif + +void __init sched_init(void) +{ + int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; +#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). + */ + if (alloc_size) { + ptr = (unsigned long)alloc_bootmem(alloc_size); + +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ + } + +#ifdef CONFIG_SMP + init_defrootdomain(); +#endif + + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_GROUP_SCHED + list_add(&init_task_group.list, &task_groups); + INIT_LIST_HEAD(&init_task_group.children); + +#ifdef CONFIG_USER_SCHED + INIT_LIST_HEAD(&root_task_group.children); + init_task_group.parent = &root_task_group; + list_add(&init_task_group.siblings, &root_task_group.children); +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ + + for_each_possible_cpu(i) { + struct rq *rq; + + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + rq->nr_running = 0; + init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.shares = 
init_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). + */ + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ + init_tg_cfs_entry(&init_task_group, + &per_cpu(init_cfs_rq, i), + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); + +#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#ifdef CONFIG_RT_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); + init_tg_rt_entry(&init_task_group, + &per_cpu(init_rt_rq, i), + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); +#endif +#endif + + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; +#ifdef CONFIG_SMP + rq->sd = NULL; + rq->rd = NULL; + rq->active_balance = 0; + rq->next_balance = jiffies; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); + rq_attach_root(rq, &def_root_domain); +#endif + init_rq_hrtick(rq); + atomic_set(&rq->nr_iowait, 0); + } + + set_load_weight(&init_task); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +#endif + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +#endif + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, smp_processor_id()); + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); +#ifdef CONFIG_SMP +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ + + scheduler_running = 1; +} + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#ifdef in_atomic + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + int on_rq; + + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); + resched_task(rq->curr); + } +} + +void normalize_rt_tasks(void) +{ + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + + p->se.exec_start = 0; +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; + p->se.sleep_start = 0; + p->se.block_start = 0; +#endif + + if (!rt_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: + */ + if (TASK_NICE(p) < 0 && p->mm) + set_user_nice(p, 0); + continue; + } + + spin_lock(&p->pi_lock); + rq = __task_rq_lock(p); + + normalize_task(rq, p); + + __task_rq_unlock(rq); + spin_unlock(&p->pi_lock); + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_IA64 +/* + * These functions are only useful for the IA64 MCA handling. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +struct task_struct *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. 
This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ + cpu_curr(cpu) = p; +} + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se; + struct rq *rq; + int i; + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->se) + goto err; + + tg->shares = NICE_0_LOAD; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!se) + goto err; + + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + } + + return 1; + + err: + return 0; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, + &cpu_rq(cpu)->leaf_cfs_rq_list); +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +} +#else /* !CONFG_FAIR_GROUP_SCHED */ +static inline void free_fair_sched_group(struct task_group *tg) +{ +} + +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; + + destroy_rt_bandwidth(&tg->rt_bandwidth); + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + struct rq *rq; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_rq) + goto err; + + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_se) + goto err; + + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + } + + return 1; + + err: + return 0; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, + &cpu_rq(cpu)->leaf_rt_rq_list); +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ + 
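+ /* RCU-safe unlink of this group's rt_rq from the CPU's leaf_rt_rq_list */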
list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); +} +#else /* !CONFIG_RT_GROUP_SCHED */ +static inline void free_rt_sched_group(struct task_group *tg) +{ +} + +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_GROUP_SCHED +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ + struct task_group *tg; + unsigned long flags; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg, parent)) + goto err; + + if (!alloc_rt_sched_group(tg, parent)) + goto err; + + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) { + register_fair_sched_group(tg, i); + register_rt_sched_group(tg, i); + } + list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); + spin_unlock_irqrestore(&task_group_lock, flags); + + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + free_sched_group(container_of(rhp, struct task_group, rcu)); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) { + unregister_fair_sched_group(tg, i); + unregister_rt_sched_group(tg, i); + } + list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); + spin_unlock_irqrestore(&task_group_lock, flags); + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. 
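+ *
+ * (Roughly: dequeue the task if it is queued, let its class put it if it is
+ * currently running, re-point it at the new group's per-cpu runqueue via
+ * set_task_rq(), then re-enqueue it so it is weighted against its new
+ * siblings.)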
+ */ +void sched_move_task(struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + update_rq_clock(rq); + + running = task_current(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + + set_task_rq(tsk, task_cpu(tsk)); + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->moved_group) + tsk->sched_class->moved_group(tsk); +#endif + + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, tsk, 0); + + task_rq_unlock(rq, &flags); +} +#endif /* CONFIG_GROUP_SCHED */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void __set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + int on_rq; + + on_rq = se->on_rq; + if (on_rq) + dequeue_entity(cfs_rq, se, 0); + + se->load.weight = shares; + se->load.inv_weight = 0; + + if (on_rq) + enqueue_entity(cfs_rq, se, 0); +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static DEFINE_MUTEX(shares_mutex); + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + unsigned long flags; + + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + mutex_lock(&shares_mutex); + if (tg->shares == shares) + goto done; + + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + list_del_rcu(&tg->siblings); + spin_unlock_irqrestore(&task_group_lock, flags); + + /* wait for any ongoing reference to this group to finish */ + synchronize_sched(); + + /* + * Now we are free to modify the group's share on each cpu + * w/o tripping rebalance_share or load_balance_fair. + */ + tg->shares = shares; + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); + set_se_shares(tg->se[i], shares); + } + + /* + * Enable load balance activity on this group, by inserting it back on + * each cpu's rq->leaf_cfs_rq_list. + */ + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + register_fair_sched_group(tg, i); + list_add_rcu(&tg->siblings, &tg->parent->children); + spin_unlock_irqrestore(&task_group_lock, flags); +done: + mutex_unlock(&shares_mutex); + return 0; +} + +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. 
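+ *
+ * Bandwidths are compared as <<20 fixed-point ratios, see to_ratio() below:
+ * e.g. a runtime of 950 ms in a 1 s period gives
+ * (950000000 << 20) / 1000000000 = 996147, roughly 95% of the 1 << 20
+ * scale, while RUNTIME_INF maps to the full 1 << 20.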
+ */ +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + return div64_u64(runtime << 20, period); +} + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); + + return 0; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; + +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + +#ifdef CONFIG_USER_SCHED + if (tg == &root_task_group) { + period = global_rt_period(); + runtime = global_rt_runtime(); + } +#endif + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); +} + +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); + unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +int 
sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + if (rt_period == 0) + return -EINVAL; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + u64 runtime, period; + int ret = 0; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} + +#ifdef CONFIG_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + struct task_group, css); +} + +static struct cgroup_subsys_state * +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct task_group *tg, *parent; + + if (!cgrp->parent) { + /* This is early initialization for the top cgroup */ + return &init_task_group.css; + } + + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + + return &tg->css; +} + +static void +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_destroy_group(tg); +} + +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk) +{ +#ifdef CONFIG_RT_GROUP_SCHED + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) + return 
-EINVAL; +#else + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; +#endif + + return 0; +} + +static void +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cont, struct task_struct *tsk) +{ + sched_move_task(tsk); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 shareval) +{ + return sched_group_set_shares(cgroup_tg(cgrp), shareval); +} + +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) tg->shares; +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + s64 val) +{ + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); +} + +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_runtime(cgroup_tg(cgrp)); +} + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", + .read_u64 = cpu_shares_read_u64, + .write_u64 = cpu_shares_write_u64, + }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, +#endif +}; + +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); +} + +struct cgroup_subsys cpu_cgroup_subsys = { + .name = "cpu", + .create = cpu_cgroup_create, + .destroy = cpu_cgroup_destroy, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .populate = cpu_cgroup_populate, + .subsys_id = cpu_cgroup_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). 
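+ *
+ * Usage is kept as one u64 per cpu per group (ca->cpuusage); readers sum
+ * over the present CPUs, and 32-bit kernels take rq->lock around each
+ * per-cpu read/write so the 64-bit access stays consistent.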
+ */ + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 *cpuusage; + struct cpuacct *parent; +}; + +struct cgroup_subsys cpuacct_subsys; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_create( + struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + if (!ca) + return ERR_PTR(-ENOMEM); + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) { + kfree(ca); + return ERR_PTR(-ENOMEM); + } + + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + + return &ca->css; +} + +/* destroy an existing cpu accounting group */ +static void +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + +}; + +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); +} + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. 
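+ *
+ * The charge is added to the task's own group and to every ancestor (the
+ * loop below walks ca->parent), so a parent group's usage is the sum over
+ * its whole subtree.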
+ */ +static void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + if (!cpuacct_subsys.active) + return; + + cpu = task_cpu(tsk); + ca = task_ca(tsk); + + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + } +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .create = cpuacct_create, + .destroy = cpuacct_destroy, + .populate = cpuacct_populate, + .subsys_id = cpuacct_subsys_id, +}; +#endif /* CONFIG_CGROUP_CPUACCT */ +#endif /* !DDE_LINUX */ diff --git a/libdde-linux26/lib/src/kernel/sched_cpupri.h b/libdde-linux26/lib/src/kernel/sched_cpupri.h new file mode 100644 index 00000000..642a94ef --- /dev/null +++ b/libdde-linux26/lib/src/kernel/sched_cpupri.h @@ -0,0 +1,37 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include <linux/sched.h> + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + spinlock_t lock; + int count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp, bool bootmem); +void cpupri_cleanup(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/libdde-linux26/lib/src/kernel/sys.c b/libdde-linux26/lib/src/kernel/sys.c new file mode 100644 index 00000000..6533cb97 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/sys.c @@ -0,0 +1,1893 @@ +/* + * linux/kernel/sys.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/utsname.h> +#include <linux/mman.h> +#include <linux/smp_lock.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/prctl.h> +#include <linux/highuid.h> +#include <linux/fs.h> +#include <linux/resource.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/workqueue.h> +#include <linux/capability.h> +#include <linux/device.h> +#include <linux/key.h> +#include <linux/times.h> +#include <linux/posix-timers.h> +#include <linux/security.h> +#include <linux/dcookies.h> +#include <linux/suspend.h> +#include <linux/tty.h> +#include <linux/signal.h> +#include <linux/cn_proc.h> +#include <linux/getcpu.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/seccomp.h> +#include <linux/cpu.h> +#include <linux/ptrace.h> + +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <linux/kprobes.h> +#include <linux/user_namespace.h> + +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/unistd.h> + +#ifndef SET_UNALIGN_CTL +# define SET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_UNALIGN_CTL +# define GET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEMU_CTL +# define SET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEMU_CTL +# define GET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEXC_CTL +# define SET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEXC_CTL +# define GET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef 
SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef GET_TSC_CTL +# define GET_TSC_CTL(a) (-EINVAL) +#endif +#ifndef SET_TSC_CTL +# define SET_TSC_CTL(a) (-EINVAL) +#endif + +#ifndef DDE_LINUX +/* + * this is where the system-wide overflow UID and GID are defined, for + * architectures that now have 32-bit UID/GID but didn't in the past + */ + +int overflowuid = DEFAULT_OVERFLOWUID; +int overflowgid = DEFAULT_OVERFLOWGID; + +#ifdef CONFIG_UID16 +EXPORT_SYMBOL(overflowuid); +EXPORT_SYMBOL(overflowgid); +#endif + +/* + * the same as above, but for filesystems which can only store a 16-bit + * UID and GID. as such, this is needed on all architectures + */ + +int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; +int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; + +EXPORT_SYMBOL(fs_overflowuid); +EXPORT_SYMBOL(fs_overflowgid); + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +#endif /* DDE_LINUX */ +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +#ifndef DDE_LINUX +/* + * set the priority of a task + * - the caller must hold the RCU read lock + */ +static int set_one_prio(struct task_struct *p, int niceval, int error) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + int no_nice; + + if (pcred->uid != cred->euid && + pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + error = -EPERM; + goto out; + } + if (niceval < task_nice(p) && !can_nice(p, niceval)) { + error = -EACCES; + goto out; + } + no_nice = security_task_setnice(p, niceval); + if (no_nice) { + error = no_nice; + goto out; + } + if (error == -ESRCH) + error = 0; + set_user_nice(p, niceval); +out: + return error; +} + +SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) +{ + struct task_struct *g, *p; + struct user_struct *user; + const struct cred *cred = current_cred(); + int error = -EINVAL; + struct pid *pgrp; + + if (which > PRIO_USER || which < PRIO_PROCESS) + goto out; + + /* normalize: avoid signed division (rounding problems) */ + error = -ESRCH; + if (niceval < -20) + niceval = -20; + if (niceval > 19) + niceval = 19; + + read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = (struct user_struct *) cred->user; + if (!who) + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) + if (__task_cred(p)->uid == who) + error = set_one_prio(p, niceval, error); + while_each_thread(g, p); + if (who != cred->uid) + free_uid(user); /* For find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); +out: + return error; +} + +/* + * Ugh. To avoid negative return values, "getpriority()" will + * not return the normal nice-value, but a negated value that + * has been offset by 20 (ie it returns 40..1 instead of -20..19) + * to stay compatible. 
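+ *
+ * E.g. a task at nice -5 is reported as 20 - (-5) = 25 and one at nice 19
+ * as 1; the C library is expected to convert this back to -20..19.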
+ */ +SYSCALL_DEFINE2(getpriority, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + const struct cred *cred = current_cred(); + long niceval, retval = -ESRCH; + struct pid *pgrp; + + if (which > PRIO_USER || which < PRIO_PROCESS) + return -EINVAL; + + read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = (struct user_struct *) cred->user; + if (!who) + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) + if (__task_cred(p)->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + while_each_thread(g, p); + if (who != cred->uid) + free_uid(user); /* for find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); + + return retval; +} + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ +void emergency_restart(void) +{ + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + device_shutdown(); + sysdev_shutdown(); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + sysdev_shutdown(); + printk(KERN_EMERG "System halted.\n"); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + disable_nonboot_cpus(); + sysdev_shutdown(); + printk(KERN_EMERG "Power down.\n"); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. 
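+ * (magic1 must be LINUX_REBOOT_MAGIC1 and magic2 one of the accepted
+ * LINUX_REBOOT_MAGIC2* values, see the check at the top of the handler.)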
+ * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + char buffer[256]; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + lock_kernel(); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + unlock_kernel(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + unlock_kernel(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + unlock_kernel(); + return -EFAULT; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + int ret; + ret = kernel_kexec(); + unlock_kernel(); + return ret; + } +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + { + int ret = hibernate(); + unlock_kernel(); + return ret; + } +#endif + + default: + unlock_kernel(); + return -EINVAL; + } + unlock_kernel(); + return 0; +} + +static void deferred_cad(struct work_struct *dummy) +{ + kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +/* + * Unprivileged users may change the real gid to the effective gid + * or vice versa. (BSD-style) + * + * If you set the real gid at all, or set the effective gid to a value not + * equal to the real gid, then the saved gid is set to the new effective gid. + * + * This makes it possible for a setgid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. + * + * The general idea is that a program which uses just setregid() will be + * 100% compatible with BSD. A program which uses just setgid() will be + * 100% compatible with POSIX with saved IDs. + * + * SMP: There are not races, the GIDs are checked only by filesystem + * operations (as far as semantic preservation is concerned). 
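+ *
+ * E.g. a setgid helper can drop its group privilege for good with
+ * setregid(getgid(), getgid()): setting the real gid forces the saved gid
+ * to the new effective gid as well.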
+ */ +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); + if (retval) + goto error; + + retval = -EPERM; + if (rgid != (gid_t) -1) { + if (old->gid == rgid || + old->egid == rgid || + capable(CAP_SETGID)) + new->gid = rgid; + else + goto error; + } + if (egid != (gid_t) -1) { + if (old->gid == egid || + old->egid == egid || + old->sgid == egid || + capable(CAP_SETGID)) + new->egid = egid; + else + goto error; + } + + if (rgid != (gid_t) -1 || + (egid != (gid_t) -1 && egid != old->gid)) + new->sgid = new->egid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * setgid() is implemented like SysV w/ SAVED_IDS + * + * SMP: Same implicit races as above. + */ +SYSCALL_DEFINE1(setgid, gid_t, gid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); + if (retval) + goto error; + + retval = -EPERM; + if (capable(CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = gid; + else if (gid == old->gid || gid == old->sgid) + new->egid = new->fsgid = gid; + else + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * change the user struct in a credentials set to match the new UID + */ +static int set_user(struct cred *new) +{ + struct user_struct *new_user; + + new_user = alloc_uid(current_user_ns(), new->uid); + if (!new_user) + return -EAGAIN; + + if (!task_can_switch_user(new_user, current)) { + free_uid(new_user); + return -EINVAL; + } + + if (atomic_read(&new_user->processes) >= + current->signal->rlim[RLIMIT_NPROC].rlim_cur && + new_user != INIT_USER) { + free_uid(new_user); + return -EAGAIN; + } + + free_uid(new->user); + new->user = new_user; + return 0; +} + +/* + * Unprivileged users may change the real uid to the effective uid + * or vice versa. (BSD-style) + * + * If you set the real uid at all, or set the effective uid to a value not + * equal to the real uid, then the saved uid is set to the new effective uid. + * + * This makes it possible for a setuid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. + * + * The general idea is that a program which uses just setreuid() will be + * 100% compatible with BSD. A program which uses just setuid() will be + * 100% compatible with POSIX with saved IDs. 
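+ *
+ * E.g. a setuid program can swap its real and effective uids with
+ * setreuid(geteuid(), getuid()) to run with reduced privilege and swap
+ * them back later to regain it (the classic BSD idiom mentioned below).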
+ */ +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); + if (retval) + goto error; + + retval = -EPERM; + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (old->uid != ruid && + old->euid != ruid && + !capable(CAP_SETUID)) + goto error; + } + + if (euid != (uid_t) -1) { + new->euid = euid; + if (old->uid != euid && + old->euid != euid && + old->suid != euid && + !capable(CAP_SETUID)) + goto error; + } + + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + if (ruid != (uid_t) -1 || + (euid != (uid_t) -1 && euid != old->uid)) + new->suid = new->euid; + new->fsuid = new->euid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * setuid() is implemented like SysV with SAVED_IDS + * + * Note that SAVED_ID's is deficient in that a setuid root program + * like sendmail, for example, cannot set its uid to be a normal + * user and then switch back, because if you're root, setuid() sets + * the saved uid too. If you don't like this, blame the bright people + * in the POSIX committee and/or USG. Note that the BSD-style setreuid() + * will allow a root program to temporarily drop privileges and be able to + * regain them by swapping the real and effective uid. + */ +SYSCALL_DEFINE1(setuid, uid_t, uid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); + if (retval) + goto error; + + retval = -EPERM; + if (capable(CAP_SETUID)) { + new->suid = new->uid = uid; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + } else if (uid != old->uid && uid != new->suid) { + goto error; + } + + new->fsuid = new->euid = uid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + + +/* + * This function implements a generic ability to update ruid, euid, + * and suid. This allows you to implement the 4.4 compatible seteuid(). 
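+ *
+ * E.g. seteuid(e) becomes setresuid(-1, e, -1): the -1 arguments leave the
+ * real and saved uids untouched.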
+ */ +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); + if (retval) + goto error; + old = current_cred(); + + retval = -EPERM; + if (!capable(CAP_SETUID)) { + if (ruid != (uid_t) -1 && ruid != old->uid && + ruid != old->euid && ruid != old->suid) + goto error; + if (euid != (uid_t) -1 && euid != old->uid && + euid != old->euid && euid != old->suid) + goto error; + if (suid != (uid_t) -1 && suid != old->uid && + suid != old->euid && suid != old->suid) + goto error; + } + + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + } + if (euid != (uid_t) -1) + new->euid = euid; + if (suid != (uid_t) -1) + new->suid = suid; + new->fsuid = new->euid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) + retval = put_user(cred->suid, suid); + + return retval; +} + +/* + * Same as above, but for rgid, egid, sgid. + */ +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); + if (retval) + goto error; + + retval = -EPERM; + if (!capable(CAP_SETGID)) { + if (rgid != (gid_t) -1 && rgid != old->gid && + rgid != old->egid && rgid != old->sgid) + goto error; + if (egid != (gid_t) -1 && egid != old->gid && + egid != old->egid && egid != old->sgid) + goto error; + if (sgid != (gid_t) -1 && sgid != old->gid && + sgid != old->egid && sgid != old->sgid) + goto error; + } + + if (rgid != (gid_t) -1) + new->gid = rgid; + if (egid != (gid_t) -1) + new->egid = egid; + if (sgid != (gid_t) -1) + new->sgid = sgid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) + retval = put_user(cred->sgid, sgid); + + return retval; +} + + +/* + * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This + * is used for "access()" and for the NFS daemon (letting nfsd stay at + * whatever uid it wants to). It normally shadows "euid", except when + * explicitly set by setfsuid() or for access.. 
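+ *
+ * The call returns the previous fsuid, so a server can bracket an access
+ * with old = setfsuid(user); ...filesystem access...; setfsuid(old);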
+ */ +SYSCALL_DEFINE1(setfsuid, uid_t, uid) +{ + const struct cred *old; + struct cred *new; + uid_t old_fsuid; + + new = prepare_creds(); + if (!new) + return current_fsuid(); + old = current_cred(); + old_fsuid = old->fsuid; + + if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) + goto error; + + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || + capable(CAP_SETUID)) { + if (uid != old_fsuid) { + new->fsuid = uid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + goto change_okay; + } + } + +error: + abort_creds(new); + return old_fsuid; + +change_okay: + commit_creds(new); + return old_fsuid; +} + +/* + * Samma på svenska.. + */ +SYSCALL_DEFINE1(setfsgid, gid_t, gid) +{ + const struct cred *old; + struct cred *new; + gid_t old_fsgid; + + new = prepare_creds(); + if (!new) + return current_fsgid(); + old = current_cred(); + old_fsgid = old->fsgid; + + if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) + goto error; + + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || + capable(CAP_SETGID)) { + if (gid != old_fsgid) { + new->fsgid = gid; + goto change_okay; + } + } + +error: + abort_creds(new); + return old_fsgid; + +change_okay: + commit_creds(new); + return old_fsgid; +} + +void do_sys_times(struct tms *tms) +{ + struct task_cputime cputime; + cputime_t cutime, cstime; + + thread_group_cputime(current, &cputime); + spin_lock_irq(¤t->sighand->siglock); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(cputime.utime); + tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + +SYSCALL_DEFINE1(times, struct tms __user *, tbuf) +{ + if (tbuf) { + struct tms tmp; + + do_sys_times(&tmp); + if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) + return -EFAULT; + } + force_successful_syscall_return(); + return (long) jiffies_64_to_clock_t(get_jiffies_64()); +} + +/* + * This needs some heavy checking ... + * I just haven't the stomach for it. I also don't fully + * understand sessions/pgrp etc. Let somebody who does explain it. + * + * OK, I think I have the protection semantics right.... this is really + * only important on a multi-user system anyway, to make sure one user + * can't send a signal to a process owned by another. -TYT, 12/12/91 + * + * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. + * LBT 04.03.94 + */ +SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) +{ + struct task_struct *p; + struct task_struct *group_leader = current->group_leader; + struct pid *pgrp; + int err; + + if (!pid) + pid = task_pid_vnr(group_leader); + if (!pgid) + pgid = pid; + if (pgid < 0) + return -EINVAL; + + /* From this point forward we keep holding onto the tasklist lock + * so that our parent does not change from under us. 
-DaveM + */ + write_lock_irq(&tasklist_lock); + + err = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + + err = -EINVAL; + if (!thread_group_leader(p)) + goto out; + + if (same_thread_group(p->real_parent, group_leader)) { + err = -EPERM; + if (task_session(p) != task_session(group_leader)) + goto out; + err = -EACCES; + if (p->did_exec) + goto out; + } else { + err = -ESRCH; + if (p != group_leader) + goto out; + } + + err = -EPERM; + if (p->signal->leader) + goto out; + + pgrp = task_pid(p); + if (pgid != pid) { + struct task_struct *g; + + pgrp = find_vpid(pgid); + g = pid_task(pgrp, PIDTYPE_PGID); + if (!g || task_session(g) != task_session(group_leader)) + goto out; + } + + err = security_task_setpgid(p, pgid); + if (err) + goto out; + + if (task_pgrp(p) != pgrp) { + change_pid(p, PIDTYPE_PGID, pgrp); + set_task_pgrp(p, pid_nr(pgrp)); + } + + err = 0; +out: + /* All paths lead to here, thus we are safe. -DaveM */ + write_unlock_irq(&tasklist_lock); + return err; +} + +SYSCALL_DEFINE1(getpgid, pid_t, pid) +{ + struct task_struct *p; + struct pid *grp; + int retval; + + rcu_read_lock(); + if (!pid) + grp = task_pgrp(current); + else { + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + grp = task_pgrp(p); + if (!grp) + goto out; + + retval = security_task_getpgid(p); + if (retval) + goto out; + } + retval = pid_vnr(grp); +out: + rcu_read_unlock(); + return retval; +} + +#ifdef __ARCH_WANT_SYS_GETPGRP + +SYSCALL_DEFINE0(getpgrp) +{ + return sys_getpgid(0); +} + +#endif + +SYSCALL_DEFINE1(getsid, pid_t, pid) +{ + struct task_struct *p; + struct pid *sid; + int retval; + + rcu_read_lock(); + if (!pid) + sid = task_session(current); + else { + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + sid = task_session(p); + if (!sid) + goto out; + + retval = security_task_getsid(p); + if (retval) + goto out; + } + retval = pid_vnr(sid); +out: + rcu_read_unlock(); + return retval; +} + +SYSCALL_DEFINE0(setsid) +{ + struct task_struct *group_leader = current->group_leader; + struct pid *sid = task_pid(group_leader); + pid_t session = pid_vnr(sid); + int err = -EPERM; + + write_lock_irq(&tasklist_lock); + /* Fail if I am already a session leader */ + if (group_leader->signal->leader) + goto out; + + /* Fail if a process group id already exists that equals the + * proposed session id. + */ + if (pid_task(sid, PIDTYPE_PGID)) + goto out; + + group_leader->signal->leader = 1; + __set_special_pids(sid); + + proc_clear_tty(group_leader); + + err = session; +out: + write_unlock_irq(&tasklist_lock); + return err; +} + +/* + * Supplementary group IDs + */ + +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + +struct group_info *groups_alloc(int gidsetsize) +{ + struct group_info *group_info; + int nblocks; + int i; + + nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; + /* Make sure we always allocate at least one indirect block pointer */ + nblocks = nblocks ? 
: 1; + group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); + if (!group_info) + return NULL; + group_info->ngroups = gidsetsize; + group_info->nblocks = nblocks; + atomic_set(&group_info->usage, 1); + + if (gidsetsize <= NGROUPS_SMALL) + group_info->blocks[0] = group_info->small_block; + else { + for (i = 0; i < nblocks; i++) { + gid_t *b; + b = (void *)__get_free_page(GFP_USER); + if (!b) + goto out_undo_partial_alloc; + group_info->blocks[i] = b; + } + } + return group_info; + +out_undo_partial_alloc: + while (--i >= 0) { + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); + return NULL; +} + +EXPORT_SYMBOL(groups_alloc); + +void groups_free(struct group_info *group_info) +{ + if (group_info->blocks[0] != group_info->small_block) { + int i; + for (i = 0; i < group_info->nblocks; i++) + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); +} + +EXPORT_SYMBOL(groups_free); + +/* export the group_info to a user-space array */ +static int groups_to_user(gid_t __user *grouplist, + const struct group_info *group_info) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_to_user(grouplist, group_info->blocks[i], len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + return 0; +} + +/* fill a group_info from a user-space array - it must be allocated already */ +static int groups_from_user(struct group_info *group_info, + gid_t __user *grouplist) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_from_user(group_info->blocks[i], grouplist, len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + return 0; +} + +/* a simple Shell sort */ +static void groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = GROUP_AT(group_info, right); + + while (left >= 0 && GROUP_AT(group_info, left) > tmp) { + GROUP_AT(group_info, right) = + GROUP_AT(group_info, left); + right = left; + left -= stride; + } + GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} + +/* a simple bsearch */ +int groups_search(const struct group_info *group_info, gid_t grp) +{ + unsigned int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + unsigned int mid = (left+right)/2; + int cmp = grp - GROUP_AT(group_info, mid); + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +/** + * set_groups - Change a group subscription in a set of credentials + * @new: The newly prepared set of credentials to alter + * @group_info: The group list to install + * + * Validate a group subscription and, if valid, insert it into a set + * of credentials. 
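+ *
+ * The list is sorted while being installed (groups_sort() above) so that
+ * groups_search()/in_group_p() can binary-search it later.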
+ */ +int set_groups(struct cred *new, struct group_info *group_info) +{ + int retval; + + retval = security_task_setgroups(group_info); + if (retval) + return retval; + + put_group_info(new->group_info); + groups_sort(group_info); + get_group_info(group_info); + new->group_info = group_info; + return 0; +} + +EXPORT_SYMBOL(set_groups); + +/** + * set_current_groups - Change current's group subscription + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon current's task + * security record. + */ +int set_current_groups(struct group_info *group_info) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = set_groups(new, group_info); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); +} + +EXPORT_SYMBOL(set_current_groups); + +SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + const struct cred *cred = current_cred(); + int i; + + if (gidsetsize < 0) + return -EINVAL; + + /* no need to grab task_lock here; it cannot change */ + i = cred->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups_to_user(grouplist, cred->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + return i; +} + +/* + * SMP: Our groups are copy-on-write. We can set them safely + * without another task interfering. + */ + +SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + struct group_info *group_info; + int retval; + + if (!capable(CAP_SETGID)) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +/* + * Check whether we're fsgid/egid or in the supplemental group.. 
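+ *
+ * in_group_p() compares against the fsgid and in_egroup_p() against the
+ * egid; both fall back to a binary search of the sorted supplementary list.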
+ */ +int in_group_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->fsgid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_group_p); + +int in_egroup_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->egid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_egroup_p); + +DECLARE_RWSEM(uts_sem); + +SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) +{ + int errno = 0; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof *name)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +} + +SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +#ifdef __ARCH_WANT_SYS_GETHOSTNAME + +SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) +{ + int i, errno; + struct new_utsname *u; + + if (len < 0) + return -EINVAL; + down_read(&uts_sem); + u = utsname(); + i = 1 + strlen(u->nodename); + if (i > len) + i = len; + errno = 0; + if (copy_to_user(name, u->nodename, i)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +} + +#endif + +/* + * Only setdomainname; getdomainname can be implemented by calling + * uname() + */ +SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + if (resource >= RLIM_NLIMITS) + return -EINVAL; + else { + struct rlimit value; + task_lock(current->group_leader); + value = current->signal->rlim[resource]; + task_unlock(current->group_leader); + return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + } +} + +#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT + +/* + * Back compatibility for getrlimit. Needed for some apps. 
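+ *
+ * Old userspace treated rlim_t as a signed 32-bit value, so RLIM_INFINITY
+ * and anything above INT_MAX are clamped to 0x7FFFFFFF before being
+ * copied out.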
+ */ + +SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct rlimit __user *, rlim) +{ + struct rlimit x; + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + task_lock(current->group_leader); + x = current->signal->rlim[resource]; + task_unlock(current->group_leader); + if (x.rlim_cur > 0x7FFFFFFF) + x.rlim_cur = 0x7FFFFFFF; + if (x.rlim_max > 0x7FFFFFFF) + x.rlim_max = 0x7FFFFFFF; + return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; +} + +#endif + +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim, *old_rlim; + int retval; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; + old_rlim = current->signal->rlim + resource; + if ((new_rlim.rlim_max > old_rlim->rlim_max) && + !capable(CAP_SYS_RESOURCE)) + return -EPERM; + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; + + retval = security_task_setrlimit(resource, &new_rlim); + if (retval) + return retval; + + if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim.rlim_cur = 1; + } + + task_lock(current->group_leader); + *old_rlim = new_rlim; + task_unlock(current->group_leader); + + if (resource != RLIMIT_CPU) + goto out; + + /* + * RLIMIT_CPU handling. Note that the kernel fails to return an error + * code if it rejected the user's attempt to set RLIMIT_CPU. This is a + * very long-standing error, and fixing it now risks breakage of + * applications, so we live with it + */ + if (new_rlim.rlim_cur == RLIM_INFINITY) + goto out; + + update_rlimit_cpu(new_rlim.rlim_cur); +out: + return 0; +} + +/* + * It would make sense to put struct rusage in the task_struct, + * except that would make the task_struct be *really big*. After + * task_struct gets moved into malloc'ed memory, it would + * make sense to do this. It will make moving the rest of the information + * a lot simpler! (Which we're not doing right now because we're not + * measuring them yet). + * + * When sampling multiple threads for RUSAGE_SELF, under SMP we might have + * races with threads incrementing their own counters. But since word + * reads are atomic, we either get new values or old values and we don't + * care which for the sums. We always take the siglock to protect reading + * the c* fields from p->signal from races with exit.c updating those + * fields when reaping, so a sample either gets all the additions of a + * given child after it's reaped, or none so this sample is before reaping. + * + * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. 
+ * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. + * + */ + +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) +{ + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); +} + +static void k_getrusage(struct task_struct *p, int who, struct rusage *r) +{ + struct task_struct *t; + unsigned long flags; + cputime_t utime, stime; + struct task_cputime cputime; + + memset((char *) r, 0, sizeof *r); + utime = stime = cputime_zero; + + if (who == RUSAGE_THREAD) { + utime = task_utime(current); + stime = task_stime(current); + accumulate_thread_rusage(p, r); + goto out; + } + + if (!lock_task_sighand(p, &flags)) + return; + + switch (who) { + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + + if (who == RUSAGE_CHILDREN) + break; + + case RUSAGE_SELF: + thread_group_cputime(p, &cputime); + utime = cputime_add(utime, cputime.utime); + stime = cputime_add(stime, cputime.stime); + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + t = p; + do { + accumulate_thread_rusage(t, r); + t = next_thread(t); + } while (t != p); + break; + + default: + BUG(); + } + unlock_task_sighand(p, &flags); + +out: + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); +} + +int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +{ + struct rusage r; + k_getrusage(p, who, &r); + return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; +} + +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) +{ + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) + return -EINVAL; + return getrusage(current, who, ru); +} + +SYSCALL_DEFINE1(umask, int, mask) +{ + mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); + return mask; +} + +SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + unsigned long, arg4, unsigned long, arg5) +{ + struct task_struct *me = current; + unsigned char comm[sizeof(me->comm)]; + long error; + + error = security_task_prctl(option, arg2, arg3, arg4, arg5); + if (error != -ENOSYS) + return error; + + error = 0; + switch (option) { + case PR_SET_PDEATHSIG: + if (!valid_signal(arg2)) { + error = -EINVAL; + break; + } + me->pdeath_signal = arg2; + error = 0; + break; + case PR_GET_PDEATHSIG: + error = put_user(me->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: + if (arg2 < 0 || arg2 > 1) { + error = -EINVAL; + break; + } + set_dumpable(me->mm, arg2); + error = 0; + break; + + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(me, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(me, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(me, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(me, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(me, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(me, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 != PR_TIMING_STATISTICAL) + error = -EINVAL; + else + error = 0; + break; + + case PR_SET_NAME: + comm[sizeof(me->comm)-1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) + return -EFAULT; + set_task_comm(me, comm); + return 0; + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, + sizeof(comm))) + return -EFAULT; + return 0; + case PR_GET_ENDIAN: + error = GET_ENDIAN(me, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(me, arg2); + break; + + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = + current->default_timer_slack_ns; + else + current->timer_slack_ns = arg2; + error = 0; + break; + default: + error = -EINVAL; + break; + } + return error; +} + +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) +{ + int err = 0; + int cpu = raw_smp_processor_id(); + if (cpup) + err |= put_user(cpu, cpup); + if (nodep) + err |= put_user(cpu_to_node(cpu), nodep); + return err ? -EFAULT : 0; +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(char **argv, char **envp) +{ + argv_free(argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. 
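+ *
+ * Illustrative caller (an assumption about typical use, not taken from
+ * this file): a hardware-monitoring driver that hits a critical
+ * temperature trip point could request a clean shutdown and fall back
+ * to an immediate power-off if that fails:
+ *
+ *	if (temp >= crit_temp)
+ *		orderly_poweroff(true);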
+ */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setcleanup(info, argv_cleanup); + + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); +#endif /* DDE_LINUX */ diff --git a/libdde-linux26/lib/src/kernel/time.c b/libdde-linux26/lib/src/kernel/time.c new file mode 100644 index 00000000..ce5b5fd4 --- /dev/null +++ b/libdde-linux26/lib/src/kernel/time.c @@ -0,0 +1,765 @@ +/* + * linux/kernel/time.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various + * time related system calls: time, stime, gettimeofday, settimeofday, + * adjtime + */ +/* + * Modification history kernel/time.c + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). + * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include <linux/module.h> +#include <linux/timex.h> +#include <linux/capability.h> +#include <linux/clocksource.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/math64.h> +#include <linux/ptrace.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> + +#include "timeconst.h" +#include <ddekit/timer.h> + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +SYSCALL_DEFINE1(time, time_t __user *, tloc) +{ + time_t i = get_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). 
+ */ + +SYSCALL_DEFINE1(stime, time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + if (likely(tv != NULL)) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (copy_to_user(tv, &ktv, sizeof(ktv))) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +static inline void warp_clock(void) +{ + write_seqlock_irq(&xtime_lock); + wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; + xtime.tv_sec += sys_tz.tz_minuteswest * 60; + update_xtime_cache(0); + write_sequnlock_irq(&xtime_lock); + clock_was_set(); +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + if (tv && !timespec_valid(tv)) + return -EINVAL; + + error = security_settime(tv, tz); + if (error) + return error; + + if (tz) { + /* SMP safe, global irq locking makes it work. */ + sys_tz = *tz; + update_vsyscall_tz(); + if (firsttime) { + firsttime = 0; + if (!tv) + warp_clock(); + } + } + if (tv) + { + /* SMP safe, again the code in arch/foo/time.c should + * globally block out interrupts when it runs. + */ + return do_settimeofday(tv); + } + return 0; +} + +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; + + if (tv) { + if (copy_from_user(&user_tv, tv, sizeof(*tv))) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} + +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +{ + struct timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. 
But bear in mind that the structures + * may change + */ + if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +} + +#ifndef DDE_LINUX +/** + * current_fs_time - Return FS time + * @sb: Superblock. + * + * Return the current time truncated to the time granularity supported by + * the fs. + */ +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = current_kernel_time(); + return timespec_trunc(now, sb->s_time_gran); +} +EXPORT_SYMBOL(current_fs_time); + +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int inline jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; +# else + return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int inline jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); +#endif + +/** + * timespec_trunc - Truncate timespec to a granularity + * @t: Timespec + * @gran: Granularity in ns. + * + * Truncate a timespec to a granularity. gran must be smaller than a second. + * Always rounds down. + * + * This function should be only used for timestamps returned by + * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because + * it doesn't handle the better resolution of the latter. + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* + * Division is pretty slow so avoid it for common cases. + * Currently current_kernel_time() never returns better than + * jiffies resolution. Exploit that. + */ + if (gran <= jiffies_to_usecs(1) * 1000) { + /* nothing */ + } else if (gran == 1000000000) { + t.tv_nsec = 0; + } else { + t.tv_nsec -= t.tv_nsec % gran; + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + +#ifndef CONFIG_GENERIC_TIME +/* + * Simulate gettimeofday using do_gettimeofday which only allows a timeval + * and therefore only yields usec accuracy + */ +void getnstimeofday(struct timespec *tv) +{ + struct timeval x; + + do_gettimeofday(&x); + tv->tv_sec = x.tv_sec; + tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; +} +EXPORT_SYMBOL_GPL(getnstimeofday); +#endif + +/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * WARNING: this function will overflow on 2106-02-07 06:28:16 on + * machines where long is 32-bit! 
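+ * As a quick sanity check on the formula below (worked by hand, not
+ * taken from this file): mktime(1970, 1, 1, 0, 0, 0) yields 0, and
+ * mktime(2000, 1, 1, 0, 0, 0) yields 946684800, the epoch offset of
+ * 2000-01-01 00:00:00 UTC.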
(However, as time_t is signed, we + * will already get problems at other places on 2038-01-19 03:14:08) + */ +unsigned long +mktime(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) +{ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((unsigned long) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} + +EXPORT_SYMBOL(mktime); + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec); + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +struct timespec ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec); + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +struct timeval ns_to_timeval(const s64 nsec) +{ + struct timespec ts = ns_to_timespec(nsec); + struct timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_timeval); + +#ifndef DDE_LINUX +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. 
+ * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldn't overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); +#else /* DDE_LINUX */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ + return (j*1000) / HZ; +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ + return (j*1000000) / HZ; +} +EXPORT_SYMBOL(jiffies_to_usecs); + +unsigned long msecs_to_jiffies(const unsigned int m) +{ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return (m * HZ + MSEC_PER_SEC) / MSEC_PER_SEC; +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; +} +EXPORT_SYMBOL(usecs_to_jiffies); +#endif + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffiesv, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. 
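+	 * (Worked example, assuming HZ == 100 so that TICK_NSEC is
+	 * 10000000: jiffiesv == 150 gives 1500000000 ns, i.e.
+	 * tv_sec == 1 and tv_nsec == 500000000.)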
+ */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffiesv * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} +EXPORT_SYMBOL(timeval_to_jiffies); + +void jiffies_to_timeval(const unsigned long jiffiesv, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + + value->tv_sec = div_u64_rem((u64)jiffiesv * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_usec = rem / NSEC_PER_USEC; +} +EXPORT_SYMBOL(jiffies_to_timeval); + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else + return x / (HZ / USER_HZ); +# endif +#else + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +#ifndef DDE_LINUX +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + return div_u64((u64)x * HZ, USER_HZ); +#endif +} +#else +unsigned long clock_t_to_jiffies(unsigned long x) +{ + assert (HZ); + assert (USER_HZ); + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +} +#endif /* DDE_LINUX */ +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +# elif HZ > USER_HZ + x = div_u64(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); +#elif (USER_HZ % 512) == 0 + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
+ */ + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); +#endif +} + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + +/* + * Add two timespec values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0) + */ +struct timespec timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} diff --git a/libdde-linux26/lib/src/kernel/timeconst.pl b/libdde-linux26/lib/src/kernel/timeconst.pl new file mode 100755 index 00000000..d459895f --- /dev/null +++ b/libdde-linux26/lib/src/kernel/timeconst.pl @@ -0,0 +1,378 @@ +#!/usr/bin/perl +# ----------------------------------------------------------------------- +# +# Copyright 2007-2008 rPath, Inc. - All Rights Reserved +# +# This file is part of the Linux kernel, and is made available under +# the terms of the GNU General Public License version 2 or (at your +# option) any later version; incorporated herein by reference. +# +# ----------------------------------------------------------------------- +# + +# +# Usage: timeconst.pl HZ > timeconst.h +# + +# Precomputed values for systems without Math::BigInt +# Generated by: +# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 +%canned_values = ( + 24 => [ + '0xa6aaaaab','0x2aaaaaa',26, + 125,3, + '0xc49ba5e4','0x1fbe76c8b4',37, + 3,125, + '0xa2c2aaab','0xaaaa',16, + 125000,3, + '0xc9539b89','0x7fffbce4217d',47, + 3,125000, + ], 32 => [ + '0xfa000000','0x6000000',27, + 125,4, + '0x83126e98','0xfdf3b645a',36, + 4,125, + '0xf4240000','0x0',17, + 31250,1, + '0x8637bd06','0x3fff79c842fa',46, + 1,31250, + ], 48 => [ + '0xa6aaaaab','0x6aaaaaa',27, + 125,6, + '0xc49ba5e4','0xfdf3b645a',36, + 6,125, + '0xa2c2aaab','0x15555',17, + 62500,3, + '0xc9539b89','0x3fffbce4217d',46, + 3,62500, + ], 64 => [ + '0xfa000000','0xe000000',28, + 125,8, + '0x83126e98','0x7ef9db22d',35, + 8,125, + '0xf4240000','0x0',18, + 15625,1, + '0x8637bd06','0x1fff79c842fa',45, + 1,15625, + ], 100 => [ + '0xa0000000','0x0',28, + 10,1, + '0xcccccccd','0x733333333',35, + 1,10, + '0x9c400000','0x0',18, + 10000,1, + '0xd1b71759','0x1fff2e48e8a7',45, + 1,10000, + ], 122 => [ + '0x8325c53f','0xfbcda3a',28, + 500,61, + '0xf9db22d1','0x7fbe76c8b',35, + 61,500, + '0x8012e2a0','0x3ef36',18, + 500000,61, + '0xffda4053','0x1ffffbce4217',45, + 61,500000, + ], 128 => [ + '0xfa000000','0x1e000000',29, + 125,16, + '0x83126e98','0x3f7ced916',34, + 16,125, + '0xf4240000','0x40000',19, + 15625,2, + '0x8637bd06','0xfffbce4217d',44, + 2,15625, + ], 200 => [ + '0xa0000000','0x0',29, + 5,1, + '0xcccccccd','0x333333333',34, + 1,5, + '0x9c400000','0x0',19, + 5000,1, + '0xd1b71759','0xfff2e48e8a7',44, + 1,5000, + ], 250 => [ + '0x80000000','0x0',29, + 4,1, + '0x80000000','0x180000000',33, + 1,4, + '0xfa000000','0x0',20, + 4000,1, + '0x83126e98','0x7ff7ced9168',43, + 1,4000, + ], 256 => [ + '0xfa000000','0x3e000000',30, + 125,32, + '0x83126e98','0x1fbe76c8b',33, + 32,125, + '0xf4240000','0xc0000',20, + 15625,4, + '0x8637bd06','0x7ffde7210be',43, + 4,15625, + ], 300 => [ + '0xd5555556','0x2aaaaaaa',30, + 10,3, + 
'0x9999999a','0x1cccccccc',33, + 3,10, + '0xd0555556','0xaaaaa',20, + 10000,3, + '0x9d495183','0x7ffcb923a29',43, + 3,10000, + ], 512 => [ + '0xfa000000','0x7e000000',31, + 125,64, + '0x83126e98','0xfdf3b645',32, + 64,125, + '0xf4240000','0x1c0000',21, + 15625,8, + '0x8637bd06','0x3ffef39085f',42, + 8,15625, + ], 1000 => [ + '0x80000000','0x0',31, + 1,1, + '0x80000000','0x0',31, + 1,1, + '0xfa000000','0x0',22, + 1000,1, + '0x83126e98','0x1ff7ced9168',41, + 1,1000, + ], 1024 => [ + '0xfa000000','0xfe000000',32, + 125,128, + '0x83126e98','0x7ef9db22',31, + 128,125, + '0xf4240000','0x3c0000',22, + 15625,16, + '0x8637bd06','0x1fff79c842f',41, + 16,15625, + ], 1200 => [ + '0xd5555556','0xd5555555',32, + 5,6, + '0x9999999a','0x66666666',31, + 6,5, + '0xd0555556','0x2aaaaa',22, + 2500,3, + '0x9d495183','0x1ffcb923a29',41, + 3,2500, + ] +); + +$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; + +sub bint($) +{ + my($x) = @_; + return Math::BigInt->new($x); +} + +# +# Constants for division by reciprocal multiplication. +# (bits, numerator, denominator) +# +sub fmul($$$) +{ + my ($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + return scalar (($n << $b)+$d-bint(1))/$d; +} + +sub fadj($$$) +{ + my($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + $d = $d/bgcd($n, $d); + return scalar (($d-bint(1)) << $b)/$d; +} + +sub fmuls($$$) { + my($b,$n,$d) = @_; + my($s,$m); + my($thres) = bint(1) << ($b-1); + + $n = bint($n); + $d = bint($d); + + for ($s = 0; 1; $s++) { + $m = fmul($s,$n,$d); + return $s if ($m >= $thres); + } + return 0; +} + +# Generate a hex value if the result fits in 64 bits; +# otherwise skip. +sub bignum_hex($) { + my($x) = @_; + my $s = $x->as_hex(); + + return (length($s) > 18) ? undef : $s; +} + +# Provides mul, adj, and shr factors for a specific +# (bit, time, hz) combination +sub muladj($$$) { + my($b, $t, $hz) = @_; + my $s = fmuls($b, $t, $hz); + my $m = fmul($s, $t, $hz); + my $a = fadj($s, $t, $hz); + return (bignum_hex($m), bignum_hex($a), $s); +} + +# Provides numerator, denominator values +sub numden($$) { + my($n, $d) = @_; + my $g = bgcd($n, $d); + return ($n/$g, $d/$g); +} + +# All values for a specific (time, hz) combo +sub conversions($$) { + my ($t, $hz) = @_; + my @val = (); + + # HZ_TO_xx + push(@val, muladj(32, $t, $hz)); + push(@val, numden($t, $hz)); + + # xx_TO_HZ + push(@val, muladj(32, $hz, $t)); + push(@val, numden($hz, $t)); + + return @val; +} + +sub compute_values($) { + my($hz) = @_; + my @val = (); + my $s, $m, $a, $g; + + if (!$has_bigint) { + die "$0: HZ == $hz not canned and ". 
+ "Math::BigInt not available\n"; + } + + # MSEC conversions + push(@val, conversions(1000, $hz)); + + # USEC conversions + push(@val, conversions(1000000, $hz)); + + return @val; +} + +sub outputval($$) +{ + my($name, $val) = @_; + my $csuf; + + if (defined($val)) { + if ($name !~ /SHR/) { + $val = "U64_C($val)"; + } + printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; + } +} + +sub output($@) +{ + my($hz, @val) = @_; + my $pfx, $bit, $suf, $s, $m, $a; + + print "/* Automatically generated by kernel/timeconst.pl */\n"; + print "/* Conversion constants for HZ == $hz */\n"; + print "\n"; + print "#ifndef KERNEL_TIMECONST_H\n"; + print "#define KERNEL_TIMECONST_H\n"; + print "\n"; + + print "#include <linux/param.h>\n"; + print "#include <linux/types.h>\n"; + + print "\n"; + print "#if HZ != $hz && !defined(DDE_LINUX)\n"; + print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; + print "#endif\n"; + print "\n"; + + foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', + 'HZ_TO_USEC','USEC_TO_HZ') { + foreach $bit (32) { + foreach $suf ('MUL', 'ADJ', 'SHR') { + outputval("${pfx}_$suf$bit", shift(@val)); + } + } + foreach $suf ('NUM', 'DEN') { + outputval("${pfx}_$suf", shift(@val)); + } + } + + print "\n"; + print "#endif /* KERNEL_TIMECONST_H */\n"; +} + +# Pretty-print Perl values +sub perlvals(@) { + my $v; + my @l = (); + + foreach $v (@_) { + if (!defined($v)) { + push(@l, 'undef'); + } elsif ($v =~ /^0x/) { + push(@l, "\'".$v."\'"); + } else { + push(@l, $v.''); + } + } + return join(',', @l); +} + +($hz) = @ARGV; + +# Use this to generate the %canned_values structure +if ($hz eq '--can') { + shift(@ARGV); + @hzlist = sort {$a <=> $b} (@ARGV); + + print "# Precomputed values for systems without Math::BigInt\n"; + print "# Generated by:\n"; + print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; + print "\%canned_values = (\n"; + my $pf = "\t"; + foreach $hz (@hzlist) { + my @values = compute_values($hz); + print "$pf$hz => [\n"; + while (scalar(@values)) { + my $bit; + foreach $bit (32) { + my $m = shift(@values); + my $a = shift(@values); + my $s = shift(@values); + print "\t\t", perlvals($m,$a,$s), ",\n"; + } + my $n = shift(@values); + my $d = shift(@values); + print "\t\t", perlvals($n,$d), ",\n"; + } + print "\t]"; + $pf = ', '; + } + print "\n);\n"; +} else { + $hz += 0; # Force to number + if ($hz < 1) { + die "Usage: $0 HZ\n"; + } + + @val = @{$canned_values{$hz}}; + if (!defined(@val)) { + @val = compute_values($hz); + } + output($hz, @val); +} +exit 0; diff --git a/libdde-linux26/lib/src/kernel/timer.c b/libdde-linux26/lib/src/kernel/timer.c new file mode 100644 index 00000000..951d6ffc --- /dev/null +++ b/libdde-linux26/lib/src/kernel/timer.c @@ -0,0 +1,1590 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers, basic process system calls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. 
Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include <linux/kernel_stat.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pid_namespace.h> +#include <linux/notifier.h> +#include <linux/thread_info.h> +#include <linux/time.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/div64.h> +#include <asm/timex.h> +#include <asm/io.h> + +#include <ddekit/timer.h> + +#ifndef DDE_LINUX + +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +/* + * per-CPU timer vector definitions: + */ +#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +struct tvec { + struct list_head vec[TVN_SIZE]; +}; + +struct tvec_root { + struct list_head vec[TVR_SIZE]; +}; + +struct tvec_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long timer_jiffies; + struct tvec_root tv1; + struct tvec tv2; + struct tvec tv3; + struct tvec tv4; + struct tvec tv5; +} ____cacheline_aligned; + +struct tvec_base boot_tvec_bases; +EXPORT_SYMBOL(boot_tvec_bases); +static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; + +/* + * Note that all tvec_bases are 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. Use the LSB for + * the new flag to indicate whether the timer is deferrable + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +{ + return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); +} + +static inline void +timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +{ + timer->base = (struct tvec_base *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} +#endif /* DDE_LINUX */ + +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. 
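+ *
+ * For example (illustrative, ignoring the per-cpu skew): at HZ == 1000
+ * a target of j == 5200 has rem == 200 < HZ/4 and is rounded down to
+ * 5000, while j == 5800 has rem == 800 and is rounded up to 6000.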
+ */ + if (rem < HZ/4 && !force_up) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. 
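+ *
+ * Illustrative use (an assumed caller, not taken from this file): a
+ * periodic housekeeping timer that only needs whole-second granularity
+ * can batch its wakeups with similar timers via
+ *
+ *	mod_timer(&t, round_jiffies(jiffies + 5 * HZ));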
+ */ +unsigned long round_jiffies(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), false); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. 
+ */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + + +#ifndef DDE_LINUX +static inline void set_running_timer(struct tvec_base *base, + struct timer_list *timer) +{ +#ifdef CONFIG_SMP + base->running_timer = timer; +#endif +} + +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + unsigned long expires = timer->expires; + unsigned long idx = expires - base->timer_jiffies; + struct list_head *vec; + + if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = base->tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = base->tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = base->tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = base->tv4.vec + i; + } else if ((signed long) idx < 0) { + /* + * Can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + } else { + int i; + /* If the timeout is larger than 0xffffffff on 64-bit + * architectures then we use the maximum timeout: + */ + if (idx > 0xffffffffUL) { + idx = 0xffffffffUL; + expires = idx + base->timer_jiffies; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; + } + /* + * Timers are FIFO: + */ + list_add_tail(&timer->entry, vec); +} + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} + +static void timer_stats_account_timer(struct timer_list *timer) +{ + unsigned int flag = 0; + + if (unlikely(tbase_get_deferrable(timer->base))) + flag |= TIMER_STATS_FLAG_DEFERRABLE; + + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, flag); +} + +#else +static void timer_stats_account_timer(struct timer_list *timer) {} +#endif + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr timer_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. 
+ */ + if (timer->entry.next == NULL && + timer->entry.prev == TIMER_ENTRY_STATIC) { + debug_object_init(timer, &timer_debug_descr); + debug_object_activate(timer, &timer_debug_descr); + return 0; + } else { + WARN_ON_ONCE(1); + } + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_free(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} + +static void __init_timer(struct timer_list *timer); + +void init_timer_on_stack(struct timer_list *timer) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + __init_timer(timer); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +#endif + +static void __init_timer(struct timer_list *timer) +{ + timer->entry.next = NULL; + timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif +} + +/** + * init_timer - initialize a timer. + * @timer: the timer to be initialized + * + * init_timer() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void init_timer(struct timer_list *timer) +{ + debug_timer_init(timer); + __init_timer(timer); +} +EXPORT_SYMBOL(init_timer); + +void init_timer_deferrable(struct timer_list *timer) +{ + init_timer(timer); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable); + +static inline void detach_timer(struct timer_list *timer, + int clear_pending) +{ + struct list_head *entry = &timer->entry; + + debug_timer_deactivate(timer); + + __list_del(entry->prev, entry->next); + if (clear_pending) + entry->next = NULL; + entry->prev = LIST_POISON2; +} + +/* + * We are using hashed locking: holding per_cpu(tvec_bases).lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on ->tvX lists. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. 
+ */ +static struct tvec_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) +{ + struct tvec_base *base; + + for (;;) { + struct tvec_base *prelock_base = timer->base; + base = tbase_get_base(prelock_base); + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(prelock_base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +int __mod_timer(struct timer_list *timer, unsigned long expires) +{ + struct tvec_base *base, *new_base; + unsigned long flags; + int ret = 0; + + timer_stats_timer_set_start_info(timer); + BUG_ON(!timer->function); + + base = lock_timer_base(timer, &flags); + + if (timer_pending(timer)) { + detach_timer(timer, 0); + ret = 1; + } + + debug_timer_activate(timer); + + new_base = __get_cpu_var(tvec_bases); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that + * the timer is serialized wrt itself. + */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + timer_set_base(timer, base); + } + } + + timer->expires = expires; + internal_add_timer(base, timer); + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +EXPORT_SYMBOL(__mod_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_timer_activate(timer); + internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); +} + +/** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer() is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) 
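+ *
+ * Typical call site (illustrative sketch, not from this file): a driver
+ * watchdog re-armed from its interrupt handler,
+ *
+ *	mod_timer(&dev->watchdog_timer, jiffies + HZ / 2);
+ *
+ * which starts the timer if it was idle and merely moves the expiry if
+ * it was already pending.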
+ */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + BUG_ON(!timer->function); + + timer_stats_timer_set_start_info(timer); + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires); +} + +EXPORT_SYMBOL(mod_timer); + +/** + * del_timer - deactive a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + */ +int del_timer(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = 0; + + timer_stats_timer_clear_start_info(timer); + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; + } + spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +} + +EXPORT_SYMBOL(del_timer); + +#ifdef CONFIG_SMP +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU. + * + * It must not be called from interrupt contexts. + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; + } +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +EXPORT_SYMBOL(try_to_del_timer_sync); + +/** + * del_timer_sync - deactivate a timer and wait for the handler to finish. + * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts. The caller must not hold locks which would prevent + * completion of the timer's handler. The timer's handler must not call + * add_timer_on(). Upon exit the timer is not queued and the handler is + * not running on any CPU. + * + * The function returns whether it has deactivated a pending timer or not. + */ +int del_timer_sync(struct timer_list *timer) +{ + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; + cpu_relax(); + } +} + +EXPORT_SYMBOL(del_timer_sync); +#endif + +static int cascade(struct tvec_base *base, struct tvec *tv, int index) +{ + /* cascade all the timers from tv up one level */ + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); + + /* + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. 
+ */ + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(tbase_get_base(timer->base) != base); + internal_add_timer(base, timer); + } + + return index; +} + +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + * + * This function cascades all vectors and executes all expired timer + * vectors. + */ +static inline void __run_timers(struct tvec_base *base) +{ + struct timer_list *timer; + + spin_lock_irq(&base->lock); + while (time_after_eq(jiffies, base->timer_jiffies)) { + struct list_head work_list; + struct list_head *head = &work_list; + int index = base->timer_jiffies & TVR_MASK; + + /* + * Cascade timers: + */ + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); + while (!list_empty(head)) { + void (*fn)(unsigned long); + unsigned long data; + + timer = list_first_entry(head, struct timer_list,entry); + fn = timer->function; + data = timer->data; + + timer_stats_account_timer(timer); + + set_running_timer(base, timer); + detach_timer(timer, 1); + spin_unlock_irq(&base->lock); + { + int preempt_count = preempt_count(); + fn(data); + if (preempt_count != preempt_count()) { + printk(KERN_ERR "huh, entered %p " + "with preempt_count %08x, exited" + " with %08x?\n", + fn, preempt_count, + preempt_count()); + BUG(); + } + } + spin_lock_irq(&base->lock); + } + } + set_running_timer(base, NULL); + spin_unlock_irq(&base->lock); +} + +#ifdef CONFIG_NO_HZ +/* + * Find out when the next timer event is due to happen. This + * is used on S/390 to stop all activity when a cpus is idle. + * This functions needs to be called disabled. + */ +static unsigned long __next_timer_interrupt(struct tvec_base *base) +{ + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + int index, slot, array, found = 0; + struct timer_list *nte; + struct tvec *varray[4]; + + /* Look for timer events in tv1. */ + index = slot = timer_jiffies & TVR_MASK; + do { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + expires = nte->expires; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; + } + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; + + /* Check tv2-tv5. */ + varray[0] = &base->tv2; + varray[1] = &base->tv3; + varray[2] = &base->tv4; + varray[3] = &base->tv5; + + for (array = 0; array < 4; array++) { + struct tvec *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; + do { + list_for_each_entry(nte, varp->vec + slot, entry) { + found = 1; + if (time_before(nte->expires, expires)) + expires = nte->expires; + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? 
*/ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; + } + return expires; +} + +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + unsigned long delta; + + if (hr_delta.tv64 == KTIME_MAX) + return expires; + + /* + * Expired timer available, let it expire in the next tick + */ + if (hr_delta.tv64 <= 0) + return now + 1; + + tsdelta = ktime_to_timespec(hr_delta); + delta = timespec_to_jiffies(&tsdelta); + + /* + * Limit the delta to the max value, which is checked in + * tick_nohz_stop_sched_tick(): + */ + if (delta > NEXT_TIMER_MAX_DELTA) + delta = NEXT_TIMER_MAX_DELTA; + + /* + * Take rounding errors in to account and make sure, that it + * expires in the next tick. Otherwise we go into an endless + * ping pong due to tick_nohz_stop_sched_tick() retriggering + * the timer softirq + */ + if (delta < 1) + delta = 1; + now += delta; + if (time_before(now, expires)) + return now; + return expires; +} + +/** + * get_next_timer_interrupt - return the jiffy of the next pending timer + * @now: current time (in jiffies) + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + struct tvec_base *base = __get_cpu_var(tvec_bases); + unsigned long expires; + + spin_lock(&base->lock); + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} +#endif + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_tick); + printk_tick(); + scheduler_tick(); + run_posix_cpu_timers(p); +} + +/* + * Nr of active tasks - counted in fixed-point numbers + */ +static unsigned long count_active_tasks(void) +{ + return nr_active() * FIXED_1; +} + +/* + * Hmm.. Changed this, as the GNU make sources (load.c) seems to + * imply that avenrun[] is the standard name for this kind of thing. + * Nothing else seems to be standardized: the fractional size etc + * all seem to differ on different machines. + * + * Requires xtime_lock to access. + */ +unsigned long avenrun[3]; + +EXPORT_SYMBOL(avenrun); + +/* + * calc_load - given tick count, update the avenrun load estimates. + * This is called while holding a write_lock on xtime_lock. + */ +static inline void calc_load(unsigned long ticks) +{ + unsigned long active_tasks; /* fixed-point */ + static int count = LOAD_FREQ; + + count -= ticks; + if (unlikely(count < 0)) { + active_tasks = count_active_tasks(); + do { + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); + count += LOAD_FREQ; + } while (count < 0); + } +} + +/* + * This function runs timers and the timer-tq in bottom half context. 
+ */ +static void run_timer_softirq(struct softirq_action *h) +{ + struct tvec_base *base = __get_cpu_var(tvec_bases); + + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + hrtimer_run_queues(); + raise_softirq(TIMER_SOFTIRQ); + softlockup_tick(); +} + +/* + * Called by the timer interrupt. xtime_lock must already be taken + * by the timer IRQ! + */ +static inline void update_times(unsigned long ticks) +{ + update_wall_time(); + calc_load(ticks); +} + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ + +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_times(ticks); +} + +#ifdef __ARCH_WANT_SYS_ALARM + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + +#ifndef __alpha__ + +/* + * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this + * should be moved into arch/i386 instead? + */ + +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +SYSCALL_DEFINE0(getpid) +{ + return task_tgid_vnr(current); +} + +/* + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). + */ +SYSCALL_DEFINE0(getppid) +{ + int pid; + + rcu_read_lock(); + pid = task_tgid_vnr(current->real_parent); + rcu_read_unlock(); + + return pid; +} + +SYSCALL_DEFINE0(getuid) +{ + /* Only we change this so SMP safe */ + return current_uid(); +} + +SYSCALL_DEFINE0(geteuid) +{ + /* Only we change this so SMP safe */ + return current_euid(); +} + +SYSCALL_DEFINE0(getgid) +{ + /* Only we change this so SMP safe */ + return current_gid(); +} + +SYSCALL_DEFINE0(getegid) +{ + /* Only we change this so SMP safe */ + return current_egid(); +} + +#endif + +static void process_timeout(unsigned long __data) +{ + wake_up_process((struct task_struct *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. 
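+ *
+ * Illustrative use only (the surrounding code is hypothetical): to sleep for
+ * roughly one second unless woken early by a signal, a caller can do
+ *
+ *      set_current_state(TASK_INTERRUPTIBLE);
+ *      remaining = schedule_timeout(HZ);
+ *
+ * where @remaining ends up holding the jiffies left if the sleep was cut
+ * short. The schedule_timeout_interruptible() and
+ * schedule_timeout_uninterruptible() wrappers below set the task state
+ * before calling here.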
+ * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + dump_stack(); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire); + schedule(); + del_singleshot_timer_sync(&timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/* Thread ID - the internal kernel "pid" */ +SYSCALL_DEFINE0(gettid) +{ + return task_pid_vnr(current); +} + +/** + * do_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill + */ +int do_sysinfo(struct sysinfo *info) +{ + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + unsigned long seq; + + memset(info, 0, sizeof(struct sysinfo)); + + do { + struct timespec tp; + seq = read_seqbegin(&xtime_lock); + + /* + * This is annoying. The below is the same thing + * posix_get_clock_monotonic() does, but it wants to + * take the lock which we want to cover the loads stuff + * too. + */ + + getnstimeofday(&tp); + tp.tv_sec += wall_to_monotonic.tv_sec; + tp.tv_nsec += wall_to_monotonic.tv_nsec; + monotonic_to_bootbased(&tp); + if (tp.tv_nsec - NSEC_PER_SEC >= 0) { + tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; + tp.tv_sec++; + } + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + + info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + info->procs = nr_threads; + } while (read_seqretry(&xtime_lock, seq)); + + si_meminfo(info); + si_swapinfo(info); + + /* + * If the sum of all the available memory (i.e. 
ram + swap) + * is less than can be stored in a 32 bit unsigned long then + * we can be binary compatible with 2.2.x kernels. If not, + * well, in that case 2.2.x was broken anyways... + * + * -Erik Andersen <andersee@debian.org> + */ + + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) + goto out; + bitcount = 0; + mem_unit = info->mem_unit; + while (mem_unit > 1) { + bitcount++; + mem_unit >>= 1; + sav_total = mem_total; + mem_total <<= 1; + if (mem_total < sav_total) + goto out; + } + + /* + * If mem_total did not overflow, multiply all memory values by + * info->mem_unit and set it to 1. This leaves things compatible + * with 2.2.x, and also retains compatibility with earlier 2.4.x + * kernels... + */ + + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) +{ + struct sysinfo val; + + do_sysinfo(&val); + + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; + + return 0; +} + +static int __cpuinit init_timers_cpu(int cpu) +{ + int j; + struct tvec_base *base; + static char __cpuinitdata tvec_base_done[NR_CPUS]; + + if (!tvec_base_done[cpu]) { + static char boot_done; + + if (boot_done) { + /* + * The APs use this path later in boot + */ + base = kmalloc_node(sizeof(*base), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + if (!base) + return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } + per_cpu(tvec_bases, cpu) = base; + } else { + /* + * This is for the boot CPU - we use compile-time + * static initialisation because per-cpu memory isn't + * ready yet and because the memory allocators are not + * initialised either. + */ + boot_done = 1; + base = &boot_tvec_bases; + } + tvec_base_done[cpu] = 1; + } else { + base = per_cpu(tvec_bases, cpu); + } + + spin_lock_init(&base->lock); + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +{ + struct timer_list *timer; + + while (!list_empty(head)) { + timer = list_first_entry(head, struct timer_list, entry); + detach_timer(timer, 0); + timer_set_base(timer, new_base); + internal_add_timer(new_base, timer); + } +} + +static void __cpuinit migrate_timers(int cpu) +{ + struct tvec_base *old_base; + struct tvec_base *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = per_cpu(tvec_bases, cpu); + new_base = get_cpu_var(tvec_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. 
+ */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < TVR_SIZE; i++) + migrate_timer_list(new_base, old_base->tv1.vec + i); + for (i = 0; i < TVN_SIZE; i++) { + migrate_timer_list(new_base, old_base->tv2.vec + i); + migrate_timer_list(new_base, old_base->tv3.vec + i); + migrate_timer_list(new_base, old_base->tv4.vec + i); + migrate_timer_list(new_base, old_base->tv5.vec + i); + } + + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_var(tvec_bases); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __cpuinit timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (init_timers_cpu(cpu) < 0) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + migrate_timers(cpu); + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata timers_nb = { + .notifier_call = timer_cpu_notify, +}; + + +void __init init_timers(void) +{ + int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + + init_timer_stats(); + + BUG_ON(err == NOTIFY_BAD); + register_cpu_notifier(&timers_nb); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); +} + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} + +EXPORT_SYMBOL(msleep); +#endif /* DDE */ + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); diff --git a/libdde-linux26/lib/src/kernel/wait.c b/libdde-linux26/lib/src/kernel/wait.c new file mode 100644 index 00000000..b10d867f --- /dev/null +++ b/libdde-linux26/lib/src/kernel/wait.c @@ -0,0 +1,301 @@ +/* + * Generic waiting primitives. 
+ * + * (C) 2004 William Irwin, Oracle + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/wait.h> +#include <linux/hash.h> + +#ifdef DDE_LINUX +#include "local.h" +#endif + +void init_waitqueue_head(wait_queue_head_t *q) +{ + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(init_waitqueue_head); + +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the critical region). + */ +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +/* + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). 
+ */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_wait); + +/* + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @state: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and noone wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_common(q, mode, 1, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. 
+ */ +int __sched +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) + ret = (*action)(q->key.flags); + } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + do { + int ret; + + prepare_to_wait_exclusive(wq, &q->wait, mode); + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; + } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + finish_wait(wq, &q->wait); + return 0; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ +#ifndef DDE_LINUX + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &key); +#else + WARN_UNIMPL; +#endif +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ +#ifndef DDE_LINUX + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + const struct zone *zone = page_zone(virt_to_page(word)); + unsigned long val = (unsigned long)word << shift | bit; + + return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +#else + WARN_UNIMPL; + return NULL; +#endif +} +EXPORT_SYMBOL(bit_waitqueue); diff --git a/libdde-linux26/lib/src/kernel/workqueue.c b/libdde-linux26/lib/src/kernel/workqueue.c new file mode 100644 index 00000000..5ad26d9f --- /dev/null +++ b/libdde-linux26/lib/src/kernel/workqueue.c @@ -0,0 +1,1038 @@ +/* + * linux/kernel/workqueue.c + * + * Generic mechanism for defining kernel helper threads for running + * arbitrary tasks in process context. 
+ * + * Started by Ingo Molnar, Copyright (C) 2002 + * + * Derived from the taskqueue/keventd code by: + * + * David Woodhouse <dwmw2@infradead.org> + * Andrew Morton + * Kai Petzke <wpp@marie.physik.tu-berlin.de> + * Theodore Ts'o <tytso@mit.edu> + * + * Made to use alloc_percpu by Christoph Lameter. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/signal.h> +#include <linux/completion.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/hardirq.h> +#include <linux/mempolicy.h> +#include <linux/freezer.h> +#include <linux/kallsyms.h> +#include <linux/debug_locks.h> +#include <linux/lockdep.h> + +#ifdef DDE_LINUX +#include "local.h" +#endif + +/* + * The per-CPU workqueue (if single thread, we always use the first + * possible cpu). + */ +struct cpu_workqueue_struct { + + spinlock_t lock; + + struct list_head worklist; + wait_queue_head_t more_work; + struct work_struct *current_work; + + struct workqueue_struct *wq; + struct task_struct *thread; + + int run_depth; /* Detect run_workqueue() recursion depth */ +} ____cacheline_aligned; + +/* + * The externally visible workqueue abstraction is an array of + * per-CPU workqueues: + */ +struct workqueue_struct { + struct cpu_workqueue_struct *cpu_wq; + struct list_head list; + const char *name; + int singlethread; + int freezeable; /* Freeze threads during suspend */ + int rt; +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +/* Serializes the accesses to the list of workqueues. */ +static DEFINE_SPINLOCK(workqueue_lock); +static LIST_HEAD(workqueues); + +static int singlethread_cpu __read_mostly; +static const struct cpumask *cpu_singlethread_map __read_mostly; +/* + * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD + * flushes cwq->worklist. This means that flush_workqueue/wait_on_work + * which comes in between can't use for_each_online_cpu(). We could + * use cpu_possible_map, the cpumask below is more a documentation + * than optimization. + */ +static cpumask_var_t cpu_populated_map __read_mostly; + +/* If it's single threaded, it isn't in the list of workqueues. */ +static inline int is_wq_single_threaded(struct workqueue_struct *wq) +{ + return wq->singlethread; +} + +static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) +{ + return is_wq_single_threaded(wq) + ? 
cpu_singlethread_map : cpu_populated_map; +} + +static +struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +{ + if (unlikely(is_wq_single_threaded(wq))) + cpu = singlethread_cpu; + return per_cpu_ptr(wq->cpu_wq, cpu); +} + +/* + * Set the workqueue on which a work item is to be run + * - Must *only* be called if the pending flag is set + */ +static inline void set_wq_data(struct work_struct *work, + struct cpu_workqueue_struct *cwq) +{ + unsigned long new; + + BUG_ON(!work_pending(work)); + + new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); + new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); + atomic_long_set(&work->data, new); +} + +static inline +struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +{ + return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); +} + +static void insert_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work, struct list_head *head) +{ + set_wq_data(work, cwq); + /* + * Ensure that we get the right work->data if we see the + * result of list_add() below, see try_to_grab_pending(). + */ + smp_wmb(); + list_add_tail(&work->entry, head); + wake_up(&cwq->more_work); +} + +static void __queue_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + unsigned long flags; + + spin_lock_irqsave(&cwq->lock, flags); + insert_work(cwq, work, &cwq->worklist); + spin_unlock_irqrestore(&cwq->lock, flags); +} + +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. + */ +int queue_work(struct workqueue_struct *wq, struct work_struct *work) +{ + int ret; + + ret = queue_work_on(get_cpu(), wq, work); + put_cpu(); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_work); + +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. + */ +int +queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +{ + int ret = 0; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + BUG_ON(!list_empty(&work->entry)); + __queue_work(wq_per_cpu(wq, cpu), work); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_on); + +static void delayed_work_timer_fn(unsigned long __data) +{ + struct delayed_work *dwork = (struct delayed_work *)__data; + struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); + struct workqueue_struct *wq = cwq->wq; + + __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); +} + +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns 0 if @work was already on a queue, non-zero otherwise. 
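+ *
+ * Illustrative sketch (wq, my_dwork and my_work_fn are hypothetical):
+ *
+ *      static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);
+ *
+ *      queue_delayed_work(wq, &my_dwork, msecs_to_jiffies(100));
+ *      ...
+ *      cancel_delayed_work_sync(&my_dwork);
+ *
+ * The delay is given in jiffies; cancel_delayed_work_sync() below kills
+ * both the timer and any already-queued work.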
+ */ +int queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + if (delay == 0) + return queue_work(wq, &dwork->work); + + return queue_delayed_work_on(-1, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(queue_delayed_work); + +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + */ +int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + timer_stats_timer_set_start_info(&dwork->timer); + + /* This stores cwq for the moment, for the timer_fn */ + set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + timer->expires = jiffies + delay; + timer->data = (unsigned long)dwork; + timer->function = delayed_work_timer_fn; + + if (unlikely(cpu >= 0)) + add_timer_on(timer, cpu); + else + add_timer(timer); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_delayed_work_on); + +static void run_workqueue(struct cpu_workqueue_struct *cwq) +{ + spin_lock_irq(&cwq->lock); + cwq->run_depth++; + if (cwq->run_depth > 3) { + /* morton gets to eat his hat */ + printk("%s: recursion depth exceeded: %d\n", + __func__, cwq->run_depth); + dump_stack(); + } + while (!list_empty(&cwq->worklist)) { + struct work_struct *work = list_entry(cwq->worklist.next, + struct work_struct, entry); + work_func_t f = work->func; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct + * from inside the function that is called from it, + * this we need to take into account for lockdep too. + * To avoid bogus "held lock freed" warnings as well + * as problems when looking into work->lockdep_map, + * make a copy and use that here. 
+ */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + + cwq->current_work = work; + list_del_init(cwq->worklist.next); + spin_unlock_irq(&cwq->lock); + + BUG_ON(get_wq_data(work) != cwq); + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + f(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), + task_pid_nr(current)); +#ifndef DDE_LINUX + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); +#endif /* DDE_LINUX */ + } + + spin_lock_irq(&cwq->lock); + cwq->current_work = NULL; + } + cwq->run_depth--; + spin_unlock_irq(&cwq->lock); +} + +static int worker_thread(void *__cwq) +{ + struct cpu_workqueue_struct *cwq = __cwq; + DEFINE_WAIT(wait); + + if (cwq->wq->freezeable) + set_freezable(); + + set_user_nice(current, -5); + + for (;;) { + prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!freezing(current) && + !kthread_should_stop() && + list_empty(&cwq->worklist)) + schedule(); + finish_wait(&cwq->more_work, &wait); + + try_to_freeze(); + + if (kthread_should_stop()) + break; + + run_workqueue(cwq); + } + + return 0; +} + +struct wq_barrier { + struct work_struct work; + struct completion done; +}; + +static void wq_barrier_func(struct work_struct *work) +{ + struct wq_barrier *barr = container_of(work, struct wq_barrier, work); + complete(&barr->done); +} + +static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, + struct wq_barrier *barr, struct list_head *head) +{ + INIT_WORK(&barr->work, wq_barrier_func); + __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + + init_completion(&barr->done); + + insert_work(cwq, &barr->work, head); +} + +static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +{ + int active; + + if (cwq->thread == current) { + /* + * Probably keventd trying to flush its own queue. So simply run + * it by hand rather than deadlocking. + */ + run_workqueue(cwq); + active = 1; + } else { + struct wq_barrier barr; + + active = 0; + spin_lock_irq(&cwq->lock); + if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { + insert_wq_barrier(cwq, &barr, &cwq->worklist); + active = 1; + } + spin_unlock_irq(&cwq->lock); + + if (active) + wait_for_completion(&barr.done); + } + + return active; +} + +/** + * flush_workqueue - ensure that any scheduled work has run to completion. + * @wq: workqueue to flush + * + * Forces execution of the workqueue and blocks until its completion. + * This is typically used in driver shutdown handlers. + * + * We sleep until all works which were queued on entry have been handled, + * but we are not livelocked by new incoming ones. + * + * This function used to run the workqueues itself. Now we just wait for the + * helper threads to do it. + */ +void flush_workqueue(struct workqueue_struct *wq) +{ + const struct cpumask *cpu_map = wq_cpu_map(wq); + int cpu; + + might_sleep(); + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); + for_each_cpu_mask_nr(cpu, *cpu_map) + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); +} +EXPORT_SYMBOL_GPL(flush_workqueue); + +/** + * flush_work - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * Returns false if @work has already terminated. 
+ * + * It is expected that, prior to calling flush_work(), the caller has + * arranged for the work to not be requeued, otherwise it doesn't make + * sense to use this function. + */ +int flush_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + struct list_head *prev; + struct wq_barrier barr; + + might_sleep(); + cwq = get_wq_data(work); + if (!cwq) + return 0; + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + prev = NULL; + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * See the comment near try_to_grab_pending()->smp_rmb(). + * If it was re-queued under us we are not going to wait. + */ + smp_rmb(); + if (unlikely(cwq != get_wq_data(work))) + goto out; + prev = &work->entry; + } else { + if (cwq->current_work != work) + goto out; + prev = &cwq->worklist; + } + insert_wq_barrier(cwq, &barr, prev->next); +out: + spin_unlock_irq(&cwq->lock); + if (!prev) + return 0; + + wait_for_completion(&barr.done); + return 1; +} +EXPORT_SYMBOL_GPL(flush_work); + +/* + * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, + * so this work can't be re-armed in any way. + */ +static int try_to_grab_pending(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + int ret = -1; + + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + return 0; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + + cwq = get_wq_data(work); + if (!cwq) + return ret; + + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong cwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (cwq == get_wq_data(work)) { + list_del_init(&work->entry); + ret = 1; + } + } + spin_unlock_irq(&cwq->lock); + + return ret; +} + +static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + struct wq_barrier barr; + int running = 0; + + spin_lock_irq(&cwq->lock); + if (unlikely(cwq->current_work == work)) { + insert_wq_barrier(cwq, &barr, cwq->worklist.next); + running = 1; + } + spin_unlock_irq(&cwq->lock); + + if (unlikely(running)) + wait_for_completion(&barr.done); +} + +static void wait_on_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + const struct cpumask *cpu_map; + int cpu; + + might_sleep(); + + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + + cwq = get_wq_data(work); + if (!cwq) + return; + + wq = cwq->wq; + cpu_map = wq_cpu_map(wq); + + for_each_cpu_mask_nr(cpu, *cpu_map) + wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); +} + +static int __cancel_work_timer(struct work_struct *work, + struct timer_list* timer) +{ + int ret; + + do { + ret = (timer && likely(del_timer(timer))); + if (!ret) + ret = try_to_grab_pending(work); + wait_on_work(work); + } while (unlikely(ret < 0)); + + work_clear_pending(work); + return ret; +} + +/** + * cancel_work_sync - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * Returns true if @work was pending. + * + * cancel_work_sync() will cancel the work if it is queued. If the work's + * callback appears to be running, cancel_work_sync() will block until it + * has completed. + * + * It is possible to use this function if the work re-queues itself. 
It can + * cancel the work even if it migrates to another workqueue, however in that + * case it only guarantees that work->func() has completed on the last queued + * workqueue. + * + * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not + * pending, otherwise it goes into a busy-wait loop until the timer expires. + * + * The caller must ensure that workqueue_struct on which this work was last + * queued can't be destroyed before this function returns. + */ +int cancel_work_sync(struct work_struct *work) +{ + return __cancel_work_timer(work, NULL); +} +EXPORT_SYMBOL_GPL(cancel_work_sync); + +/** + * cancel_delayed_work_sync - reliably kill off a delayed work. + * @dwork: the delayed work struct + * + * Returns true if @dwork was pending. + * + * It is possible to use this function if @dwork rearms itself via queue_work() + * or queue_delayed_work(). See also the comment for cancel_work_sync(). + */ +int cancel_delayed_work_sync(struct delayed_work *dwork) +{ + return __cancel_work_timer(&dwork->work, &dwork->timer); +} +EXPORT_SYMBOL(cancel_delayed_work_sync); + +static struct workqueue_struct *keventd_wq __read_mostly; + +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * This puts a job in the kernel-global workqueue. + */ +int schedule_work(struct work_struct *work) +{ + return queue_work(keventd_wq, work); +} +EXPORT_SYMBOL(schedule_work); + +/* + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +int schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, keventd_wq, work); +} +EXPORT_SYMBOL(schedule_work_on); + +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +int schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work(keventd_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work); + +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @dwork: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ +int schedule_delayed_work_on(int cpu, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work_on); + +/** + * schedule_on_each_cpu - call a function on each online CPU from keventd + * @func: the function to call + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * schedule_on_each_cpu() is very slow. 
+ */ +int schedule_on_each_cpu(work_func_t func) +{ + int cpu; + struct work_struct *works; + + works = alloc_percpu(struct work_struct); + if (!works) + return -ENOMEM; + + get_online_cpus(); + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, func); + schedule_work_on(cpu, work); + } + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); + free_percpu(works); + return 0; +} + +void flush_scheduled_work(void) +{ + flush_workqueue(keventd_wq); +} +EXPORT_SYMBOL(flush_scheduled_work); + +/** + * execute_in_process_context - reliably execute the routine with user context + * @fn: the function to execute + * @ew: guaranteed storage for the execute work structure (must + * be available when the work executes) + * + * Executes the function immediately if process context is available, + * otherwise schedules the function for delayed execution. + * + * Returns: 0 - function was executed + * 1 - function was scheduled for execution + */ +int execute_in_process_context(work_func_t fn, struct execute_work *ew) +{ + if (!in_interrupt()) { + fn(&ew->work); + return 0; + } + + INIT_WORK(&ew->work, fn); + schedule_work(&ew->work); + + return 1; +} +EXPORT_SYMBOL_GPL(execute_in_process_context); + +int keventd_up(void) +{ + return keventd_wq != NULL; +} + +int current_is_keventd(void) +{ + struct cpu_workqueue_struct *cwq; + int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ + int ret = 0; + + BUG_ON(!keventd_wq); + + cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); + if (current == cwq->thread) + ret = 1; + + return ret; + +} + +static struct cpu_workqueue_struct * +init_cpu_workqueue(struct workqueue_struct *wq, int cpu) +{ + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); + + cwq->wq = wq; + spin_lock_init(&cwq->lock); + INIT_LIST_HEAD(&cwq->worklist); + init_waitqueue_head(&cwq->more_work); + + return cwq; +} + +static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct workqueue_struct *wq = cwq->wq; + const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; + struct task_struct *p; + + p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); + /* + * Nobody can add the work_struct to this cwq, + * if (caller is __create_workqueue) + * nobody should see this wq + * else // caller is CPU_UP_PREPARE + * cpu is not on cpu_online_map + * so we can abort safely. 
+ */ + if (IS_ERR(p)) + return PTR_ERR(p); + if (cwq->wq->rt) + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); + cwq->thread = p; + + return 0; +} + +static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct task_struct *p = cwq->thread; + + if (p != NULL) { + if (cpu >= 0) + kthread_bind(p, cpu); + wake_up_process(p); + } +} + +struct workqueue_struct *__create_workqueue_key(const char *name, + int singlethread, + int freezeable, + int rt, + struct lock_class_key *key, + const char *lock_name) +{ + struct workqueue_struct *wq; + struct cpu_workqueue_struct *cwq; + int err = 0, cpu; + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return NULL; + + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } + + wq->name = name; + lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); + wq->singlethread = singlethread; + wq->freezeable = freezeable; + wq->rt = rt; + INIT_LIST_HEAD(&wq->list); + + if (singlethread) { + cwq = init_cpu_workqueue(wq, singlethread_cpu); + err = create_workqueue_thread(cwq, singlethread_cpu); + start_workqueue_thread(cwq, -1); + } else { + cpu_maps_update_begin(); + /* + * We must place this wq on list even if the code below fails. + * cpu_down(cpu) can remove cpu from cpu_populated_map before + * destroy_workqueue() takes the lock, in that case we leak + * cwq[cpu]->thread. + */ + spin_lock(&workqueue_lock); + list_add(&wq->list, &workqueues); + spin_unlock(&workqueue_lock); + /* + * We must initialize cwqs for each possible cpu even if we + * are going to call destroy_workqueue() finally. Otherwise + * cpu_up() can hit the uninitialized cwq once we drop the + * lock. + */ + for_each_possible_cpu(cpu) { + cwq = init_cpu_workqueue(wq, cpu); + if (err || !cpu_online(cpu)) + continue; + err = create_workqueue_thread(cwq, cpu); + start_workqueue_thread(cwq, cpu); + } + cpu_maps_update_done(); + } + + if (err) { + destroy_workqueue(wq); + wq = NULL; + } + return wq; +} +EXPORT_SYMBOL_GPL(__create_workqueue_key); + +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) +{ + /* + * Our caller is either destroy_workqueue() or CPU_POST_DEAD, + * cpu_add_remove_lock protects cwq->thread. + */ + if (cwq->thread == NULL) + return; + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + flush_cpu_workqueue(cwq); + /* + * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, + * a concurrent flush_workqueue() can insert a barrier after us. + * However, in that case run_workqueue() won't return and check + * kthread_should_stop() until it flushes all work_struct's. + * When ->worklist becomes empty it is safe to exit because no + * more work_structs can be queued on this cwq: flush_workqueue + * checks list_empty(), and a "normal" queue_work() can't use + * a dead CPU. + */ + kthread_stop(cwq->thread); + cwq->thread = NULL; +} + +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. 
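+ *
+ * Typical private-workqueue lifecycle, as an illustrative sketch (the
+ * "mydrv" name and my_work item are hypothetical; create_workqueue() is the
+ * <linux/workqueue.h> wrapper around __create_workqueue_key() above):
+ *
+ *      struct workqueue_struct *wq = create_workqueue("mydrv");
+ *      if (!wq)
+ *              return -ENOMEM;
+ *      queue_work(wq, &my_work);
+ *      ...
+ *      flush_workqueue(wq);
+ *      destroy_workqueue(wq);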
+ */ +void destroy_workqueue(struct workqueue_struct *wq) +{ + const struct cpumask *cpu_map = wq_cpu_map(wq); + int cpu; + + cpu_maps_update_begin(); + spin_lock(&workqueue_lock); + list_del(&wq->list); + spin_unlock(&workqueue_lock); + + for_each_cpu_mask_nr(cpu, *cpu_map) + cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); + cpu_maps_update_done(); + + free_percpu(wq->cpu_wq); + kfree(wq); +} +EXPORT_SYMBOL_GPL(destroy_workqueue); + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + int ret = NOTIFY_OK; + + action &= ~CPU_TASKS_FROZEN; + + switch (action) { + case CPU_UP_PREPARE: + cpumask_set_cpu(cpu, cpu_populated_map); + } +undo: + list_for_each_entry(wq, &workqueues, list) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + + switch (action) { + case CPU_UP_PREPARE: + if (!create_workqueue_thread(cwq, cpu)) + break; + printk(KERN_ERR "workqueue [%s] for %i failed\n", + wq->name, cpu); + action = CPU_UP_CANCELED; + ret = NOTIFY_BAD; + goto undo; + + case CPU_ONLINE: + start_workqueue_thread(cwq, cpu); + break; + + case CPU_UP_CANCELED: + start_workqueue_thread(cwq, -1); + case CPU_POST_DEAD: + cleanup_workqueue_thread(cwq); + break; + } + } + + switch (action) { + case CPU_UP_CANCELED: + case CPU_POST_DEAD: + cpumask_clear_cpu(cpu, cpu_populated_map); + } + + return ret; +} + +#ifdef CONFIG_SMP +static struct workqueue_struct *work_on_cpu_wq __read_mostly; + +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void do_work_for_cpu(struct work_struct *w) +{ + struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * This will return the value @fn returns. + * It is up to the caller to ensure that the cpu doesn't go offline. + */ +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc; + + INIT_WORK(&wfc.work, do_work_for_cpu); + wfc.fn = fn; + wfc.arg = arg; + queue_work_on(cpu, work_on_cpu_wq, &wfc.work); + flush_work(&wfc.work); + + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + +void __init init_workqueues(void) +{ + alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + + cpumask_copy(cpu_populated_map, cpu_online_mask); + singlethread_cpu = cpumask_first(cpu_possible_mask); + cpu_singlethread_map = cpumask_of(singlethread_cpu); + hotcpu_notifier(workqueue_cpu_callback, 0); + keventd_wq = create_workqueue("events"); + BUG_ON(!keventd_wq); +#ifdef CONFIG_SMP + work_on_cpu_wq = create_workqueue("work_on_cpu"); + BUG_ON(!work_on_cpu_wq); +#endif +} + +#ifdef DDE_LINUX +core_initcall(init_workqueues); +#endif diff --git a/libdde-linux26/lib/src/lib/iomap.c b/libdde-linux26/lib/src/lib/iomap.c new file mode 100644 index 00000000..d90ac2aa --- /dev/null +++ b/libdde-linux26/lib/src/lib/iomap.c @@ -0,0 +1,301 @@ +/* + * Implement the default iomap interfaces + * + * (C) Copyright 2004 Linus Torvalds + */ +#include <linux/pci.h> +#include <linux/io.h> + +#include <linux/module.h> + +#ifdef DDE_LINUX +#include "local.h" +#endif + +/* + * Read/write from/to an (offsettable) iomem cookie. It might be a PIO + * access or a MMIO access, these functions don't care. 
The info is + * encoded in the hardware mapping set up by the mapping functions + * (or the cookie itself, depending on implementation and hw). + * + * The generic routines don't assume any hardware mappings, and just + * encode the PIO/MMIO as part of the cookie. They coldly assume that + * the MMIO IO mappings are not in the low address range. + * + * Architectures for which this is not true can't use this generic + * implementation and should do their own copy. + */ + +#ifndef HAVE_ARCH_PIO_SIZE +/* + * We encode the physical PIO addresses (0-0xffff) into the + * pointer by offsetting them with a constant (0x10000) and + * assuming that all the low addresses are always PIO. That means + * we can do some sanity checks on the low bits, and don't + * need to just take things for granted. + */ +#define PIO_OFFSET 0x10000UL +#define PIO_MASK 0x0ffffUL +#define PIO_RESERVED 0x40000UL +#endif + +static void bad_io_access(unsigned long port, const char *access) +{ + static int count = 10; + if (count) { + count--; + WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access); + } +} + +/* + * Ugly macros are a way of life. + */ +#ifdef DDE_LINUX +/* DDE_LINUX maps io ports to [0xf0000000, 0xf0040000], so we also need + * to check the lower bounds of port addresses. + */ +#define IO_COND(addr, is_pio, is_mmio) do { \ + unsigned long port = (unsigned long __force)addr; \ + if (port > PIO_OFFSET && port < PIO_RESERVED) { \ + port &= PIO_MASK; \ + is_pio; \ + } else { \ + is_mmio; \ + } \ +} while (0) +#else +#define IO_COND(addr, is_pio, is_mmio) do { \ + unsigned long port = (unsigned long __force)addr; \ + if (port >= PIO_RESERVED) { \ + is_mmio; \ + } else if (port > PIO_OFFSET) { \ + port &= PIO_MASK; \ + is_pio; \ + } else \ + bad_io_access(port, #is_pio ); \ +} while (0) +#endif + +#ifndef pio_read16be +#define pio_read16be(port) swab16(inw(port)) +#define pio_read32be(port) swab32(inl(port)) +#endif + +#ifndef mmio_read16be +#define mmio_read16be(addr) be16_to_cpu(__raw_readw(addr)) +#define mmio_read32be(addr) be32_to_cpu(__raw_readl(addr)) +#endif + +unsigned int ioread8(void __iomem *addr) +{ + IO_COND(addr, return inb(port), return readb(addr)); + return 0xff; +} +unsigned int ioread16(void __iomem *addr) +{ + IO_COND(addr, return inw(port), return readw(addr)); + return 0xffff; +} +unsigned int ioread16be(void __iomem *addr) +{ + IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr)); + return 0xffff; +} +unsigned int ioread32(void __iomem *addr) +{ + IO_COND(addr, return inl(port), return readl(addr)); + return 0xffffffff; +} +unsigned int ioread32be(void __iomem *addr) +{ + IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr)); + return 0xffffffff; +} +EXPORT_SYMBOL(ioread8); +EXPORT_SYMBOL(ioread16); +EXPORT_SYMBOL(ioread16be); +EXPORT_SYMBOL(ioread32); +EXPORT_SYMBOL(ioread32be); + +#ifndef pio_write16be +#define pio_write16be(val,port) outw(swab16(val),port) +#define pio_write32be(val,port) outl(swab32(val),port) +#endif + +#ifndef mmio_write16be +#define mmio_write16be(val,port) __raw_writew(be16_to_cpu(val),port) +#define mmio_write32be(val,port) __raw_writel(be32_to_cpu(val),port) +#endif + +void iowrite8(u8 val, void __iomem *addr) +{ + IO_COND(addr, outb(val,port), writeb(val, addr)); +} +void iowrite16(u16 val, void __iomem *addr) +{ + IO_COND(addr, outw(val,port), writew(val, addr)); +} +void iowrite16be(u16 val, void __iomem *addr) +{ + IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr)); +} +void iowrite32(u32 val, void 
__iomem *addr) +{ + IO_COND(addr, outl(val,port), writel(val, addr)); +} +void iowrite32be(u32 val, void __iomem *addr) +{ + IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr)); +} +EXPORT_SYMBOL(iowrite8); +EXPORT_SYMBOL(iowrite16); +EXPORT_SYMBOL(iowrite16be); +EXPORT_SYMBOL(iowrite32); +EXPORT_SYMBOL(iowrite32be); + +/* + * These are the "repeat MMIO read/write" functions. + * Note the "__raw" accesses, since we don't want to + * convert to CPU byte order. We write in "IO byte + * order" (we also don't have IO barriers). + */ +#ifndef mmio_insb +static inline void mmio_insb(void __iomem *addr, u8 *dst, int count) +{ + while (--count >= 0) { + u8 data = __raw_readb(addr); + *dst = data; + dst++; + } +} +static inline void mmio_insw(void __iomem *addr, u16 *dst, int count) +{ + while (--count >= 0) { + u16 data = __raw_readw(addr); + *dst = data; + dst++; + } +} +static inline void mmio_insl(void __iomem *addr, u32 *dst, int count) +{ + while (--count >= 0) { + u32 data = __raw_readl(addr); + *dst = data; + dst++; + } +} +#endif + +#ifndef mmio_outsb +static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count) +{ + while (--count >= 0) { + __raw_writeb(*src, addr); + src++; + } +} +static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count) +{ + while (--count >= 0) { + __raw_writew(*src, addr); + src++; + } +} +static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count) +{ + while (--count >= 0) { + __raw_writel(*src, addr); + src++; + } +} +#endif + +void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) +{ + IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count)); +} +void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) +{ + IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count)); +} +void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) +{ + IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count)); +} +EXPORT_SYMBOL(ioread8_rep); +EXPORT_SYMBOL(ioread16_rep); +EXPORT_SYMBOL(ioread32_rep); + +void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) +{ + IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count)); +} +void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) +{ + IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count)); +} +void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) +{ + IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count)); +} +EXPORT_SYMBOL(iowrite8_rep); +EXPORT_SYMBOL(iowrite16_rep); +EXPORT_SYMBOL(iowrite32_rep); + +/* Create a virtual mapping cookie for an IO port range */ +void __iomem *ioport_map(unsigned long port, unsigned int nr) +{ + if (port > PIO_MASK) + return NULL; + return (void __iomem *) (unsigned long) (port + PIO_OFFSET); +} + +void ioport_unmap(void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(ioport_map); +EXPORT_SYMBOL(ioport_unmap); + +/** + * pci_iomap - create a virtual mapping cookie for a PCI BAR + * @dev: PCI device that owns the BAR + * @bar: BAR number + * @maxlen: length of the memory to map + * + * Using this function you will get a __iomem address to your device BAR. + * You can access it using ioread*() and iowrite*(). These functions hide + * the details if this is a MMIO or PIO address space and will just do what + * you expect from them in the correct way. + * + * @maxlen specifies the maximum length to map. 
If you want to get access to + * the complete BAR without checking for its length first, pass %0 here. + * */ +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + resource_size_t start = pci_resource_start(dev, bar); + resource_size_t len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (!len || !start) + return NULL; + if (maxlen && len > maxlen) + len = maxlen; + if (flags & IORESOURCE_IO) + return ioport_map(start, len); + if (flags & IORESOURCE_MEM) { + if (flags & IORESOURCE_CACHEABLE) + return ioremap(start, len); + return ioremap_nocache(start, len); + } + /* What? */ + return NULL; +} + +void pci_iounmap(struct pci_dev *dev, void __iomem * addr) +{ + IO_COND(addr, /* nothing */, iounmap(addr)); +} +EXPORT_SYMBOL(pci_iomap); +EXPORT_SYMBOL(pci_iounmap); diff --git a/libdde-linux26/lib/src/mach_glue/block.c b/libdde-linux26/lib/src/mach_glue/block.c new file mode 100644 index 00000000..3820712c --- /dev/null +++ b/libdde-linux26/lib/src/mach_glue/block.c @@ -0,0 +1,81 @@ +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/bio.h> +#include <ddekit/assert.h> + +struct gendisk *find_disk_by_name (char *); +void dde_page_cache_add (struct page *); + +struct block_device *open_block_dev (char *name, int part, fmode_t mode) +{ + struct gendisk *disk = find_disk_by_name (name); + if (disk) + { + dev_t devid = MKDEV (disk->major, disk->first_minor + part); + return open_by_devnum (devid, mode); + } + return ERR_PTR(-ENXIO); +} + +/* read or write a piece of data to a block device. + * DATA must be in one page. + * SECTORNR: the writing location in sectors. */ +int block_dev_rw (struct block_device *dev, int sectornr, + char *data, int count, int rw, void (*done (int err))) +{ + int err = 0; + struct bio *bio; + struct page *page; + int i; + + void end_bio (struct bio *bio, int err) + { + done (err); + } + + assert (count <= PAGE_SIZE); + bio = bio_alloc (GFP_NOIO, 1); + if (bio == NULL) + { + err = ENOMEM; + goto out; + } + + page = kmalloc (sizeof (*page), GFP_KERNEL); + if (page == NULL) + { + err = ENOMEM; + goto out; + } + + bio->bi_sector = sectornr; + bio->bi_bdev = dev; + page->virtual = data; + dde_page_cache_add (page); + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = count; + bio->bi_io_vec[0].bv_offset = (int) data & ~PAGE_MASK; + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = count; + + bio->bi_end_io = end_bio; + bio->bi_private = NULL; + bio_get (bio); + submit_bio (rw, bio); + if (bio_flagged (bio, BIO_EOPNOTSUPP)) + { + err = -EOPNOTSUPP; + goto out; + } + bio_put (bio); +out: + if (err) + { + if (bio) + bio_put (bio); + kfree (page); + } + return err; +} diff --git a/libdde-linux26/lib/src/mach_glue/net.c b/libdde-linux26/lib/src/mach_glue/net.c new file mode 100644 index 00000000..3ab9e44f --- /dev/null +++ b/libdde-linux26/lib/src/mach_glue/net.c @@ -0,0 +1,112 @@ +#include <linux/netdevice.h> +#include <linux/if.h> + +/* List of sk_buffs waiting to be freed. 
*/ +static struct sk_buff_head skb_done_list; + +struct net_device *search_netdev (char *name) +{ + struct net_device *dev; + struct net_device *found = NULL; + struct net *net; + + printk("search device %s\n", name); + read_lock(&dev_base_lock); + for_each_net(net) { + for_each_netdev(net, dev) { + printk("there is device %s, base addr: %x\n", + dev->name, dev->base_addr); + if (!strcmp (name, dev->name)) + { + found = dev; + goto end; + } + } + } +end: + read_unlock(&dev_base_lock); + return found; +} + +int linux_pkg_xmit (char *pkg_data, int len, void *del_data, + int (*del_func) (struct sk_buff *, void *), + struct net_device *dev) +{ + struct sk_buff *skb; + + if (len == 0 || len > dev->mtu + dev->hard_header_len) + return EINVAL; + + /* Allocate a sk_buff. */ + skb = dev_alloc_skb (len); + if (!skb) + return ENOMEM; + + skb->del_data = del_data; + skb->pre_del_func = del_func; + + /* Copy user data. This is only required if it spans multiple pages. */ + skb->len = len; + skb->tail = skb->data + len; + + memcpy (skb->data, pkg_data, len); + + skb->dev = dev; + + return dev_queue_xmit(skb); +} + +char *netdev_addr(struct net_device *dev) +{ + return dev->dev_addr; +} + +int netdev_flags(struct net_device *dev) +{ + return dev->flags; +} + +char *netdev_name (struct net_device *dev) +{ + return dev->name; +} + +unsigned int netdev_mtu (struct net_device *dev) +{ + return dev->mtu; +} + +unsigned short netdev_header_len (struct net_device *dev) +{ + return dev->hard_header_len; +} + +unsigned short netdev_type (struct net_device *dev) +{ + return dev->type; +} + +unsigned char netdev_addr_len (struct net_device *dev) +{ + return dev->addr_len; +} + +void *skb_reply(struct sk_buff *skb) +{ + return skb->del_data; +} + +void skb_done_head_init() +{ + skb_queue_head_init (&skb_done_list); +} + +struct sk_buff *skb_done_dequeue() +{ + return skb_dequeue (&skb_done_list); +} + +void skb_done_queue(struct sk_buff *skb) +{ + skb_queue_tail (&skb_done_list, skb); +} diff --git a/libdde-linux26/lib/src/mm/memory.c b/libdde-linux26/lib/src/mm/memory.c new file mode 100644 index 00000000..a4d66f50 --- /dev/null +++ b/libdde-linux26/lib/src/mm/memory.c @@ -0,0 +1,3203 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. 
+ * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) + */ + +#include <linux/kernel_stat.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#ifndef DDE_LINUX +#include <linux/rmap.h> +#endif +#include <linux/module.h> +#include <linux/delayacct.h> +#include <linux/init.h> +#include <linux/writeback.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> +#include <linux/kallsyms.h> +#include <linux/swapops.h> +#include <linux/elf.h> + +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +/* use the per-pgdat data instead for discontigmem - mbligh */ +unsigned long max_mapnr; +#ifndef DDE_LINUX +struct page *mem_map; +#endif + +EXPORT_SYMBOL(max_mapnr); +#ifndef DDE_LINUX +EXPORT_SYMBOL(mem_map); +#endif +#endif + +unsigned long num_physpages; +/* + * A number of key systems in x86 including ioremap() rely on the assumption + * that high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ +void * high_memory; + +EXPORT_SYMBOL(num_physpages); +EXPORT_SYMBOL(high_memory); + +/* + * Randomize the address space (stacks, mmaps, brk, etc.). + * + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, + * as ancient (libc5 based) binaries can segfault. ) + */ +int randomize_va_space __read_mostly = +#ifdef CONFIG_COMPAT_BRK + 1; +#else + 2; +#endif + +#ifndef DDE_LINUX +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 1; +} +__setup("norandmaps", disable_randmaps); + + +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. 
+ */ +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +{ + pgtable_t token = pmd_pgtable(*pmd); + pmd_clear(pmd); + pte_free_tlb(tlb, token); + tlb->mm->nr_ptes--; +} + +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + free_pte_range(tlb, pmd); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd); +} + +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + free_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud); +} + +/* + * This function frees user-level page tables of a process. + * + * Must be called with pagetable lock held. + */ +void free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + unsigned long start; + + /* + * The next few lines have given us lots of grief... + * + * Why are we testing PMD* at this top level? Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. But what about where end is brought down + * by PMD_SIZE below? no, end can't go down to 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. 
+ */ + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + start = addr; + pgd = pgd_offset(tlb->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); +} + +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long floor, unsigned long ceiling) +{ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + unsigned long addr = vma->vm_start; + + /* + * Hide vma from rmap and vmtruncate before freeing pgtables + */ + anon_vma_unlink(vma); + unlink_file_vma(vma); + + if (is_vm_hugetlb_page(vma)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_vm_hugetlb_page(next)) { + vma = next; + next = vma->vm_next; + anon_vma_unlink(vma); + unlink_file_vma(vma); + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } + vma = next; + } +} + +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pgtable_t new = pte_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_read_barrier_depends() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ + + spin_lock(&mm->page_table_lock); + if (!pmd_present(*pmd)) { /* Has another populated it ? */ + mm->nr_ptes++; + pmd_populate(mm, pmd, new); + new = NULL; + } + spin_unlock(&mm->page_table_lock); + if (new) + pte_free(mm, new); + return 0; +} + +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +{ + pte_t *new = pte_alloc_one_kernel(&init_mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&init_mm.page_table_lock); + if (!pmd_present(*pmd)) { /* Has another populated it ? */ + pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } + spin_unlock(&init_mm.page_table_lock); + if (new) + pte_free_kernel(&init_mm, new); + return 0; +} + +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +{ + if (file_rss) + add_mm_counter(mm, file_rss, file_rss); + if (anon_rss) + add_mm_counter(mm, anon_rss, anon_rss); +} + +/* + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. + * + * The calling function must still handle the error. 
+ */ +static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, struct page *page) +{ + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + struct address_space *mapping; + pgoff_t index; + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + return; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; + index = linear_page_index(vma, addr); + + printk(KERN_ALERT + "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); + if (page) { + printk(KERN_ALERT + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); + } + printk(KERN_ALERT + "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + /* + * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y + */ + if (vma->vm_ops) + print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", + (unsigned long)vma->vm_ops->fault); + if (vma->vm_file && vma->vm_file->f_op) + print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", + (unsigned long)vma->vm_file->f_op->mmap); + dump_stack(); + add_taint(TAINT_BAD_PAGE); +} + +static inline int is_cow_mapping(unsigned int flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +/* + * vm_normal_page -- This function gets the "struct page" associated with a pte. + * + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. + * + * There are 2 broad cases. Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. + * + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. + * + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). + * + * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 
+ * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. + * + */ +#ifdef __HAVE_ARCH_PTE_SPECIAL +# define HAVE_PTE_SPECIAL 1 +#else +# define HAVE_PTE_SPECIAL 0 +#endif +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + if (HAVE_PTE_SPECIAL) { + if (likely(!pte_special(pte))) + goto check_pfn; + if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } + + /* !HAVE_PTE_SPECIAL case follows: */ + + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + +check_pfn: + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } +#endif + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + */ + +static inline void +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss) +{ + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; + struct page *page; + + /* pte contains position in swap or file, so copy. */ + if (unlikely(!pte_present(pte))) { + if (!pte_file(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. 
+ */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } + } + goto out_set_pte; + } + + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if (is_cow_mapping(vm_flags)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = pte_wrprotect(pte); + } + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page, vma, addr); + rss[!!PageAnon(page)]++; + } + +out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); +} + +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; + int rss[2]; + +again: + rss[1] = rss[0] = 0; + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) + return -ENOMEM; + src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); + + do { + /* + * We are holding two locks at this point - either of them + * could generate latencies in another task on another CPU. + */ + if (progress >= 32) { + progress = 0; + if (need_resched() || + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) + break; + } + if (pte_none(*src_pte)) { + progress++; + continue; + } + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap_nested(src_pte - 1); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); + if (addr != end) + goto again; + return 0; +} + +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pmd_t *src_pmd, *dst_pmd; + unsigned long next; + + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; +} + +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pud_t *src_pud, *dst_pud; + unsigned long next; + + dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + if (!dst_pud) + return -ENOMEM; + src_pud = pud_offset(src_pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; +} + +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + int ret; + + /* + * Don't copy ptes where a page fault will fill 
them correctly. + * Fork becomes much lighter when there are big shared or private + * readonly mappings. The tradeoff is that copy_page_range is more + * efficient than faulting. + */ + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { + if (!vma->anon_vma) + return 0; + } + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, vma); + + if (unlikely(is_pfn_mapping(vma))) { + /* + * We do not free on error cases below as remove_vma + * gets called on error from higher level routine + */ + ret = track_pfn_vma_copy(vma); + if (ret) + return ret; + } + + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true. + */ + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_start(src_mm, addr, end); + + ret = 0; + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next))) { + ret = -ENOMEM; + break; + } + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_end(src_mm, + vma->vm_start, end); + return ret; +} + +static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + struct mm_struct *mm = tlb->mm; + pte_t *pte; + spinlock_t *ptl; + int file_rss = 0; + int anon_rss = 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + do { + pte_t ptent = *pte; + if (pte_none(ptent)) { + (*zap_work)--; + continue; + } + + (*zap_work) -= PAGE_SIZE; + + if (pte_present(ptent)) { + struct page *page; + + page = vm_normal_page(vma, addr, ptent); + if (unlikely(details) && page) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping && + details->check_mapping != page->mapping) + continue; + /* + * Each page->index must be checked when + * invalidating or truncating nonlinear. + */ + if (details->nonlinear_vma && + (page->index < details->first_index || + page->index > details->last_index)) + continue; + } + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(!page)) + continue; + if (unlikely(details) && details->nonlinear_vma + && linear_page_index(details->nonlinear_vma, + addr) != page->index) + set_pte_at(mm, addr, pte, + pgoff_to_pte(page->index)); + if (PageAnon(page)) + anon_rss--; + else { + if (pte_dirty(ptent)) + set_page_dirty(page); + if (pte_young(ptent) && + likely(!VM_SequentialReadHint(vma))) + mark_page_accessed(page); + file_rss--; + } + page_remove_rmap(page); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + tlb_remove_page(tlb, page); + continue; + } + /* + * If details->check_mapping, we leave swap entries; + * if details->nonlinear_vma, we leave file entries. 
+ */ + if (unlikely(details)) + continue; + if (pte_file(ptent)) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) + print_bad_pte(vma, addr, ptent, NULL); + } else if + (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) + print_bad_pte(vma, addr, ptent, NULL); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + + add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) { + (*zap_work)--; + continue; + } + next = zap_pte_range(tlb, vma, pmd, addr, next, + zap_work, details); + } while (pmd++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static inline unsigned long zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) { + (*zap_work)--; + continue; + } + next = zap_pmd_range(tlb, vma, pud, addr, next, + zap_work, details); + } while (pud++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static unsigned long unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pgd_t *pgd; + unsigned long next; + + if (details && !details->check_mapping && !details->nonlinear_vma) + details = NULL; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + (*zap_work)--; + continue; + } + next = zap_pud_range(tlb, vma, pgd, addr, next, + zap_work, details); + } while (pgd++, addr = next, (addr != end && *zap_work > 0)); + tlb_end_vma(tlb, vma); + + return addr; +} + +#ifdef CONFIG_PREEMPT +# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) +#else +/* No preempt: go for improved straight-line efficiency */ +# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) +#endif + +/** + * unmap_vmas - unmap a range of memory covered by a list of vma's + * @tlbp: address of the caller's struct mmu_gather + * @vma: the starting vma + * @start_addr: virtual address at which to start unmapping + * @end_addr: virtual address at which to end unmapping + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here + * @details: details of nonlinear truncation or shared cache invalidation + * + * Returns the end address of the unmapping (restart addr if interrupted). + * + * Unmap all pages in the vma list. + * + * We aim to not hold locks for too long (for scheduling latency reasons). + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * return the ending mmu_gather to the caller. + * + * Only addresses between `start' and `end' will be unmapped. + * + * The VMA list must be sorted in ascending virtual address order. + * + * unmap_vmas() assumes that the caller will flush the whole unmapped address + * range after unmap_vmas() returns. 
So the only responsibility here is to + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules. + */ +unsigned long unmap_vmas(struct mmu_gather **tlbp, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *details) +{ + long zap_work = ZAP_BLOCK_SIZE; + unsigned long tlb_start = 0; /* For tlb_finish_mmu */ + int tlb_start_valid = 0; + unsigned long start = start_addr; + spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; + int fullmm = (*tlbp)->fullmm; + struct mm_struct *mm = vma->vm_mm; + + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { + unsigned long end; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + continue; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + continue; + + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += (end - start) >> PAGE_SHIFT; + + if (unlikely(is_pfn_mapping(vma))) + untrack_pfn_vma(vma, 0, 0); + + while (start != end) { + if (!tlb_start_valid) { + tlb_start = start; + tlb_start_valid = 1; + } + + if (unlikely(is_vm_hugetlb_page(vma))) { + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + unmap_hugepage_range(vma, start, end, NULL); + zap_work -= (end - start) / + pages_per_huge_page(hstate_vma(vma)); + } + + start = end; + } else + start = unmap_page_range(*tlbp, vma, + start, end, &zap_work, details); + + if (zap_work > 0) { + BUG_ON(start != end); + break; + } + + tlb_finish_mmu(*tlbp, tlb_start, start); + + if (need_resched() || + (i_mmap_lock && spin_needbreak(i_mmap_lock))) { + if (i_mmap_lock) { + *tlbp = NULL; + goto out; + } + cond_resched(); + } + + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); + tlb_start_valid = 0; + zap_work = ZAP_BLOCK_SIZE; + } + } +out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); + return start; /* which is now the end (or restart) address */ +} + +/** + * zap_page_range - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of nonlinear truncation or shared cache invalidation + */ +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather *tlb; + unsigned long end = address + size; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + if (tlb) + tlb_finish_mmu(tlb, address, end); + return end; +} + +/** + * zap_vma_ptes - remove ptes mapping the vma + * @vma: vm_area_struct holding ptes to be zapped + * @address: starting address of pages to zap + * @size: number of bytes to zap + * + * This function only unmaps ptes assigned to VM_PFNMAP vmas. + * + * The entire address range must be fully contained within the vma. + * + * Returns 0 if successful. 
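+ *
+ * A minimal caller sketch (hypothetical driver revoking a mapping it set
+ * up earlier with remap_pfn_range(); "my_vma" is illustrative only):
+ *
+ *   if (zap_vma_ptes(my_vma, my_vma->vm_start,
+ *                    my_vma->vm_end - my_vma->vm_start))
+ *           printk(KERN_WARNING "zap_vma_ptes failed\n");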
+ */ +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + if (address < vma->vm_start || address + size > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + return -1; + zap_page_range(vma, address, size, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(zap_vma_ptes); + +/* + * Do a quick page-table lookup for a single page. + */ +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + + page = NULL; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto no_page_table; + if (pmd_huge(*pmd)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + + pte = *ptep; + if (!pte_present(pte)) + goto no_page; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) + goto bad_page; + + if (flags & FOLL_GET) + get_page(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return page; + +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + /* Fall through to ZERO_PAGE handling */ +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate page tables. + */ + if (flags & FOLL_ANON) { + page = ZERO_PAGE(0); + if (flags & FOLL_GET) + get_page(page); + BUG_ON(flags & FOLL_WRITE); + } + return page; +} + +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault routine, it's not an anonymous region. + */ + return !vma->vm_ops || !vma->vm_ops->fault; +} + + + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas) +{ + int i; + unsigned int vm_flags = 0; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); + int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); + + if (len <= 0) + return 0; + /* + * Require read or write permissions. + * If 'force' is set, we only require the "MAY" flags. + */ + vm_flags = write ? 
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + i = 0; + + do { + struct vm_area_struct *vma; + unsigned int foll_flags; + + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(tsk, start)) { + unsigned long pg = start & PAGE_MASK; + struct vm_area_struct *gate_vma = get_gate_vma(tsk); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* user gate pages are read-only */ + if (!ignore && write) + return i ? : -EFAULT; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + if (pmd_none(*pmd)) + return i ? : -EFAULT; + pte = pte_offset_map(pmd, pg); + if (pte_none(*pte)) { + pte_unmap(pte); + return i ? : -EFAULT; + } + if (pages) { + struct page *page = vm_normal_page(gate_vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); + } + pte_unmap(pte); + if (vmas) + vmas[i] = gate_vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + (!ignore && !(vm_flags & vma->vm_flags))) + return i ? : -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &len, i, write); + continue; + } + + foll_flags = FOLL_TOUCH; + if (pages) + foll_flags |= FOLL_GET; + if (!write && use_zero_page(vma)) + foll_flags |= FOLL_ANON; + + do { + struct page *page; + + /* + * If we have a pending SIGKILL, don't keep faulting + * pages and potentially allocating memory, unless + * current is handling munlock--e.g., on exit. In + * that case, we are not allocating memory. Rather, + * we're only unlocking already resident/mapped pages. + */ + if (unlikely(!ignore_sigkill && + fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; + + if (write) + foll_flags |= FOLL_WRITE; + + cond_resched(); + while (!(page = follow_page(vma, start, foll_flags))) { + int ret; + ret = handle_mm_fault(mm, vma, start, + foll_flags & FOLL_WRITE); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + else if (ret & VM_FAULT_SIGBUS) + return i ? i : -EFAULT; + BUG(); + } + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + + /* + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. But only + * do so when looping for pte_write is futile: + * in some cases userspace may also be wanting + * to write to the gotten user page, which a + * read fault here might prevent (a readonly + * page might get reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && + !(vma->vm_flags & VM_WRITE)) + foll_flags &= ~FOLL_WRITE; + + cond_resched(); + } + if (IS_ERR(page)) + return i ? 
i : PTR_ERR(page); + if (pages) { + pages[i] = page; + + flush_anon_page(vma, page, start); + flush_dcache_page(page); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + } while (len && start < vma->vm_end); + } while (len); + return i; +} + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} + +EXPORT_SYMBOL(get_user_pages); + +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pud, addr); + if (pmd) + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + return NULL; +} + +/* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + */ +static int insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -EINVAL; + if (PageAnon(page)) + goto out; + retval = -ENOMEM; + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + + retval = 0; + pte_unmap_unlock(pte, ptl); + return retval; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * + * This allows drivers to insert individual pages they've allocated + * into a user vma. + * + * The page has to be a nice clean _individual_ kernel allocation. + * If you allocate a compound page, you need to have marked it as + * such (__GFP_COMP), or manually just split the page up yourself + * (see split_page()). + * + * NOTE! Traditionally this was done with "remap_pfn_range()" which + * took an arbitrary page protection parameter. This doesn't allow + * that. Your vma protection will have to be set up correctly, which + * means that if you want a shared writable mapping, you'd better + * ask for a shared writable mapping! + * + * The page does not need to be reserved. + */ +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!page_count(page)) + return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; + return insert_page(vma, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page); + +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte, entry; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. 
*/ + entry = pte_mkspecial(pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, entry); + update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/** + * vm_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_inert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + */ +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + int ret; + pgprot_t pgprot = vma->vm_page_prot; + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) + return -EINVAL; + + ret = insert_pfn(vma, addr, pfn, pgprot); + + if (ret) + untrack_pfn_vma(vma, pfn, PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL(vm_insert_pfn); + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* + * refcount the page if pfn_valid is true (hence insert_page rather + * than insert_pfn). + */ + if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + struct page *page; + + page = pfn_to_page(pfn); + return insert_page(vma, addr, page, vma->vm_page_prot); + } + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_mixed); + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. 
any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + arch_enter_lazy_mmu_mode(); + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); + return 0; +} + +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + PAGE_ALIGN(size); + struct mm_struct *mm = vma->vm_mm; + int err; + + /* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_RESERVED is specified all over the place, because + * in 2.4 it kept swapout's vma scan off this vma; but + * in 2.6 the LRU scan won't even find its pages, so this + * flag means no more than count its pages in reserved_vm, + * and omit it from core dump, even when VM_IO turned off. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". 
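+ *
+ * The usual caller is a driver's mmap handler; a minimal sketch
+ * (hypothetical device, "mydev_phys_base" is a made-up physical base
+ * address, error handling elided):
+ *
+ *   static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
+ *   {
+ *           unsigned long pfn = mydev_phys_base >> PAGE_SHIFT;
+ *
+ *           return remap_pfn_range(vma, vma->vm_start, pfn,
+ *                                  vma->vm_end - vma->vm_start,
+ *                                  vma->vm_page_prot);
+ *   }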
+ */ + if (addr == vma->vm_start && end == vma->vm_end) + vma->vm_pgoff = pfn; + else if (is_cow_mapping(vma->vm_flags)) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + if (err) { + /* + * To indicate that track_pfn related cleanup is not + * needed from higher level routine calling unmap_vmas + */ + vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); + return -EINVAL; + } + + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); + err = remap_pud_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + if (err) + untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); + + return err; +} +EXPORT_SYMBOL(remap_pfn_range); + +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + pgtable_t token; + spinlock_t *uninitialized_var(ptl); + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + arch_enter_lazy_mmu_mode(); + + token = pmd_pgtable(*pmd); + + do { + err = fn(pte, token, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + BUG_ON(pud_huge(*pud)); + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long start = addr, end = addr + size; + int err; + + BUG_ON(addr >= end); + mmu_notifier_invalidate_range_start(mm, start, end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + mmu_notifier_invalidate_range_end(mm, start, end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + +/* + * handle_pte_fault chooses page fault handler according to an entry + * which was read non-atomically. Before making any commitment, on + * those architectures or configurations (e.g. 
i386 with PAE) which + * might give a mix of unmatched parts, do_swap_page and do_file_page + * must check under lock before unmapping the pte and proceeding + * (but do_wp_page is only called after already making such a check; + * and do_anonymous_page and do_no_page can safely check later on). + */ +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, + pte_t *page_table, pte_t orig_pte) +{ + int same = 1; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + if (sizeof(pte_t) > sizeof(unsigned long)) { + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + same = pte_same(*page_table, orig_pte); + spin_unlock(ptl); + } +#endif + pte_unmap(page_table); + return same; +} + +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(dst); + } else + copy_user_highpage(dst, src, va, vma); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + spinlock_t *ptl, pte_t orig_pte) +{ + struct page *old_page, *new_page; + pte_t entry; + int reuse = 0, ret = 0; + int page_mkwrite = 0; + struct page *dirty_page = NULL; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) { + /* + * VM_MIXEDMAP !pfn_valid() case + * + * We should not cow pages in a shared writeable mapping. + * Just mark the pages writable as we can't do any dirty + * accounting on raw pfn maps. 
+ */ + if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED)) + goto reuse; + goto gotten; + } + + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page)) { + if (!trylock_page(old_page)) { + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + lock_page(old_page); + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); + goto unlock; + } + page_cache_release(old_page); + } + reuse = reuse_swap_page(old_page); + unlock_page(old_page); + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + /* + * Notify the address space that the page is about to + * become writable so that it can prohibit this or wait + * for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can + * sleep if it needs to. + */ + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + + if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + goto unwritable_page; + + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + page_cache_release(old_page); + if (!pte_same(*page_table, orig_pte)) + goto unlock; + + page_mkwrite = 1; + } + dirty_page = old_page; + get_page(dirty_page); + reuse = 1; + } + + if (reuse) { +reuse: + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, address, page_table, entry,1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto unlock; + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); +gotten: + pte_unmap_unlock(page_table, ptl); + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + VM_BUG_ON(old_page == ZERO_PAGE(0)); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((vma->vm_flags & VM_LOCKED) && old_page) { + lock_page(old_page); /* for LRU manipulation */ + clear_page_mlock(old_page); + unlock_page(old_page); + } + cow_user_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); + + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + goto oom_free_new; + + /* + * Re-check the pte - we dropped the lock + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } + } else + inc_mm_counter(mm, anon_rss); + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. 
+ */ + ptep_clear_flush_notify(vma, address, page_table); + page_add_new_anon_rmap(new_page, vma, address); + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, entry); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptep_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page); + } + + /* Free the old page.. */ + new_page = old_page; + ret |= VM_FAULT_WRITE; + } else + mem_cgroup_uncharge_page(new_page); + + if (new_page) + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); +unlock: + pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clears all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + put_page(dirty_page); + } + return ret; +oom_free_new: + page_cache_release(new_page); +oom: + if (old_page) + page_cache_release(old_page); + return VM_FAULT_OOM; + +unwritable_page: + page_cache_release(old_page); + return VM_FAULT_SIGBUS; +} + +/* + * Helper functions for unmap_mapping_range(). + * + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * + * We have to restart searching the prio_tree whenever we drop the lock, + * since the iterator is only valid while the lock is held, and anyway + * a later vma might be split and reinserted earlier while lock dropped. + * + * The list of nonlinear vmas could be handled more efficiently, using + * a placeholder, but handle it in the same way until a need is shown. + * It is important to search the prio_tree before nonlinear list: a vma + * may become nonlinear and be shifted from prio_tree to nonlinear list + * while the lock is dropped; but never shifted from list to prio_tree. + * + * In order to make forward progress despite restarting the search, + * vm_truncate_count is used to mark a vma as now dealt with, so we can + * quickly skip it next time around. Since the prio_tree search only + * shows us those vmas affected by unmapping the range in question, we + * can't efficiently keep all vmas in step with mapping->truncate_count: + * so instead reset them all whenever it wraps back to 0 (then go to 1). + * mapping->truncate_count and vma->vm_truncate_count are protected by + * i_mmap_lock. + * + * In order to make forward progress despite repeatedly restarting some + * large vma, note the restart_addr from unmap_vmas when it breaks out: + * and restart from that address when we reach that vma again. It might
It might + * have been split or merged, shrunk or extended, but never shifted: so + * restart_addr remains valid so long as it remains in the vma's range. + * unmap_mapping_range forces truncate_count to leap over page-aligned + * values so we can save vma's restart_addr in its truncate_count field. + */ +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) + +static void reset_vma_truncate_counts(struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + vma->vm_truncate_count = 0; + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_truncate_count = 0; +} + +static int unmap_mapping_range_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr, + struct zap_details *details) +{ + unsigned long restart_addr; + int need_break; + + /* + * files that support invalidating or truncating portions of the + * file from under mmaped areas must have their ->fault function + * return a locked page (and set VM_FAULT_LOCKED in the return). + * This provides synchronisation against concurrent unmapping here. + */ + +again: + restart_addr = vma->vm_truncate_count; + if (is_restart_addr(restart_addr) && start_addr < restart_addr) { + start_addr = restart_addr; + if (start_addr >= end_addr) { + /* Top of vma has been split off since last time */ + vma->vm_truncate_count = details->truncate_count; + return 0; + } + } + + restart_addr = zap_page_range(vma, start_addr, + end_addr - start_addr, details); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock); + + if (restart_addr >= end_addr) { + /* We have now completed this vma: mark it so */ + vma->vm_truncate_count = details->truncate_count; + if (!need_break) + return 0; + } else { + /* Note restart_addr in vma's truncate_count field */ + vma->vm_truncate_count = restart_addr; + if (!need_break) + goto again; + } + + spin_unlock(details->i_mmap_lock); + cond_resched(); + spin_lock(details->i_mmap_lock); + return -EINTR; +} + +static inline void unmap_mapping_range_tree(struct prio_tree_root *root, + struct zap_details *details) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + pgoff_t vba, vea, zba, zea; + +restart: + vma_prio_tree_foreach(vma, &iter, root, + details->first_index, details->last_index) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + + vba = vma->vm_pgoff; + vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ + zba = details->first_index; + if (zba < vba) + zba = vba; + zea = details->last_index; + if (zea > vea) + zea = vea; + + if (unmap_mapping_range_vma(vma, + ((zba - vba) << PAGE_SHIFT) + vma->vm_start, + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, + details) < 0) + goto restart; + } +} + +static inline void unmap_mapping_range_list(struct list_head *head, + struct zap_details *details) +{ + struct vm_area_struct *vma; + + /* + * In nonlinear VMAs there is no correspondence between virtual address + * offset and file offset. So we must perform an exhaustive search + * across *all* the pages in each nonlinear VMA, not just the pages + * whose virtual address lies outside the file truncation point. 
+ */ +restart: + list_for_each_entry(vma, head, shared.vm_set.list) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + details->nonlinear_vma = vma; + if (unmap_mapping_range_vma(vma, vma->vm_start, + vma->vm_end, details) < 0) + goto restart; + } +} + +/** + * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. + * @mapping: the address space containing mmaps to be unmapped. + * @holebegin: byte in first page to unmap, relative to the start of + * the underlying file. This will be rounded down to a PAGE_SIZE + * boundary. Note that this is different from vmtruncate(), which + * must keep the partial page. In contrast, we must get rid of + * partial pages. + * @holelen: size of prospective hole in bytes. This will be rounded + * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * end of the file. + * @even_cows: 1 when truncating a file, unmap even private COWed pages; + * but 0 when invalidating pagecache, don't throw away private data. + */ +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) +{ + struct zap_details details; + pgoff_t hba = holebegin >> PAGE_SHIFT; + pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Check for overflow. */ + if (sizeof(holelen) > sizeof(hlen)) { + long long holeend = + (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (holeend & ~(long long)ULONG_MAX) + hlen = ULONG_MAX - hba + 1; + } + + details.check_mapping = even_cows? NULL: mapping; + details.nonlinear_vma = NULL; + details.first_index = hba; + details.last_index = hba + hlen - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + details.i_mmap_lock = &mapping->i_mmap_lock; + + spin_lock(&mapping->i_mmap_lock); + + /* Protect against endless unmapping loops */ + mapping->truncate_count++; + if (unlikely(is_restart_addr(mapping->truncate_count))) { + if (mapping->truncate_count == 0) + reset_vma_truncate_counts(mapping); + mapping->truncate_count++; + } + details.truncate_count = mapping->truncate_count; + + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) + unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); + spin_unlock(&mapping->i_mmap_lock); +} +EXPORT_SYMBOL(unmap_mapping_range); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + if (inode->i_size < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + i_size_write(inode, offset); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. 
+ */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } + + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; + +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} +EXPORT_SYMBOL(vmtruncate); + +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op->truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + down_write(&inode->i_alloc_sem); + unmap_mapping_range(mapping, offset, (end - offset), 1); + truncate_inode_pages_range(mapping, offset, end); + unmap_mapping_range(mapping, offset, (end - offset), 1); + inode->i_op->truncate_range(inode, offset, end); + up_write(&inode->i_alloc_sem); + mutex_unlock(&inode->i_mutex); + + return 0; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + spinlock_t *ptl; + struct page *page; + swp_entry_t entry; + pte_t pte; + struct mem_cgroup *ptr = NULL; + int ret = 0; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + goto out; + + entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); + page = lookup_swap_cache(entry); + if (!page) { + grab_swap_token(); /* Contend for token _before_ read-in */ + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, address); + if (!page) { + /* + * Back out if somebody else faulted in this pte + * while we released the pte lock. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) + ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto unlock; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + } + + mark_page_accessed(page); + + lock_page(page); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + + if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { + ret = VM_FAULT_OOM; + unlock_page(page); + goto out; + } + + /* + * Back out if somebody else already faulted in this pte. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*page_table, orig_pte))) + goto out_nomap; + + if (unlikely(!PageUptodate(page))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; + } + + /* + * The page isn't present yet, go ahead with the fault. + * + * Be careful about the sequence of operations here. 
+ * To get its accounting right, reuse_swap_page() must be called + * while the page is counted on swap but not yet in mapcount i.e. + * before page_add_anon_rmap() and swap_free(); try_to_free_swap() + * must be called after the swap_free(), or it will never succeed. + * Because delete_from_swap_page() may be called by reuse_swap_page(), + * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry + * in page->private. In this case, a record in swap_cgroup is silently + * discarded at swap_free(). + */ + + inc_mm_counter(mm, anon_rss); + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && reuse_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + write_access = 0; + } + flush_icache_page(vma, page); + set_pte_at(mm, address, page_table, pte); + page_add_anon_rmap(page, vma, address); + /* It's better to call commit-charge after rmap is established */ + mem_cgroup_commit_charge_swapin(page, ptr); + + swap_free(entry); + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + try_to_free_swap(page); + unlock_page(page); + + if (write_access) { + ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); + if (ret & VM_FAULT_ERROR) + ret &= VM_FAULT_ERROR; + goto out; + } + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); +unlock: + pte_unmap_unlock(page_table, ptl); +out: + return ret; +out_nomap: + mem_cgroup_cancel_charge_swapin(ptr); + pte_unmap_unlock(page_table, ptl); + unlock_page(page); + page_cache_release(page); + return ret; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + struct page *page; + spinlock_t *ptl; + pte_t entry; + + /* Allocate our own private page. */ + pte_unmap(page_table); + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage_movable(vma, address); + if (!page) + goto oom; + __SetPageUptodate(page); + + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + goto oom_free_page; + + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, anon_rss); + page_add_new_anon_rmap(page, vma, address); + set_pte_at(mm, address, page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, entry); +unlock: + pte_unmap_unlock(page_table, ptl); + return 0; +release: + mem_cgroup_uncharge_page(page); + page_cache_release(page); + goto unlock; +oom_free_page: + page_cache_release(page); +oom: + return VM_FAULT_OOM; +} + +/* + * __do_fault() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid + * the next page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte neither mapped nor locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
+ */ +static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + pte_t *page_table; + spinlock_t *ptl; + struct page *page; + pte_t entry; + int anon = 0; + int charged = 0; + struct page *dirty_page = NULL; + struct vm_fault vmf; + int ret; + int page_mkwrite = 0; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = NULL; + + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + + /* + * For consistency in subsequent calls, make the faulted page always + * locked. + */ + if (unlikely(!(ret & VM_FAULT_LOCKED))) + lock_page(vmf.page); + else + VM_BUG_ON(!PageLocked(vmf.page)); + + /* + * Should we do an early C-O-W break? + */ + page = vmf.page; + if (flags & FAULT_FLAG_WRITE) { + if (!(vma->vm_flags & VM_SHARED)) { + anon = 1; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out; + } + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + page_cache_release(page); + goto out; + } + charged = 1; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) + clear_page_mlock(vmf.page); + copy_user_highpage(page, vmf.page, address, vma); + __SetPageUptodate(page); + } else { + /* + * If the page will be shareable, see if the backing + * address space wants to know that the page is about + * to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(page); + if (vma->vm_ops->page_mkwrite(vma, page) < 0) { + ret = VM_FAULT_SIGBUS; + anon = 1; /* no anon but release vmf.page */ + goto out_unlocked; + } + lock_page(page); + /* + * XXX: this is not quite right (racy vs + * invalidate) to unlock and relock the page + * like this, however a better fix requires + * reworking page_mkwrite locking API, which + * is better done later. + */ + if (!page->mapping) { + ret = 0; + anon = 1; /* no anon but release vmf.page */ + goto out; + } + page_mkwrite = 1; + } + } + + } + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... 
*/ + if (likely(pte_same(*page_table, orig_pte))) { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (anon) { + inc_mm_counter(mm, anon_rss); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page = page; + get_page(dirty_page); + } + } + set_pte_at(mm, address, page_table, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); + } else { + if (charged) + mem_cgroup_uncharge_page(page); + if (anon) + page_cache_release(page); + else + anon = 1; /* no anon but release faulted_page */ + } + + pte_unmap_unlock(page_table, ptl); + +out: + unlock_page(vmf.page); +out_unlocked: + if (anon) + page_cache_release(vmf.page); + else if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + + set_page_dirty_balance(dirty_page, page_mkwrite); + put_page(dirty_page); + } + + return ret; +} + +static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + pgoff_t pgoff = (((address & PAGE_MASK) + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); + + pte_unmap(page_table); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + +/* + * Fault of a previously existing named mapping. Repopulate the pte + * from the encoded file_pte if possible. This enables swappable + * nonlinear vmas. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + unsigned int flags = FAULT_FLAG_NONLINEAR | + (write_access ? FAULT_FLAG_WRITE : 0); + pgoff_t pgoff; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + return 0; + + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + /* + * Page table corrupted: show pte and kill process. + */ + print_bad_pte(vma, address, orig_pte, NULL); + return VM_FAULT_OOM; + } + + pgoff = pte_to_pgoff(orig_pte); + return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
+ */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, int write_access) +{ + pte_t entry; + spinlock_t *ptl; + + entry = *pte; + if (!pte_present(entry)) { + if (pte_none(entry)) { + if (vma->vm_ops) { + if (likely(vma->vm_ops->fault)) + return do_linear_fault(mm, vma, address, + pte, pmd, write_access, entry); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); + } + if (pte_file(entry)) + return do_nonlinear_fault(mm, vma, address, + pte, pmd, write_access, entry); + return do_swap_page(mm, vma, address, + pte, pmd, write_access, entry); + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*pte, entry))) + goto unlock; + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, + pte, pmd, ptl, entry); + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { + update_mmu_cache(vma, address, entry); + } else { + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (write_access) + flush_tlb_page(vma, address); + } +unlock: + pte_unmap_unlock(pte, ptl); + return 0; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + + if (unlikely(is_vm_hugetlb_page(vma))) + return hugetlb_fault(mm, vma, address, write_access); + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (!pud) + return VM_FAULT_OOM; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + return VM_FAULT_OOM; + pte = pte_alloc_map(mm, pmd, address); + if (!pte) + return VM_FAULT_OOM; + + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); +} + +#ifndef __PAGETABLE_PUD_FOLDED +/* + * Allocate page upper directory. + * We've already handled the fast-path in-line. + */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new = pud_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) /* Another has populated it */ + pud_free(mm, new); + else + pgd_populate(mm, pgd, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +/* + * Allocate page middle directory. + * We've already handled the fast-path in-line. 
+ */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&mm->page_table_lock); +#ifndef __ARCH_HAS_4LEVEL_HACK + if (pud_present(*pud)) /* Another has populated it */ + pmd_free(mm, new); + else + pud_populate(mm, pud, new); +#else + if (pgd_present(*pud)) /* Another has populated it */ + pmd_free(mm, new); + else + pgd_populate(mm, pud, new); +#endif /* __ARCH_HAS_4LEVEL_HACK */ + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_PMD_FOLDED */ + +int make_pages_present(unsigned long addr, unsigned long end) +{ + int ret, len, write; + struct vm_area_struct * vma; + + vma = find_vma(current->mm, addr); + if (!vma) + return -ENOMEM; + write = (vma->vm_flags & VM_WRITE) != 0; + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); + len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; + ret = get_user_pages(current, current->mm, addr, + len, write, 0, NULL, NULL); + if (ret < 0) + return ret; + return ret == len ? 0 : -EFAULT; +} + +#if !defined(__HAVE_ARCH_GATE_AREA) + +#if defined(AT_SYSINFO_EHDR) +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} +__initcall(gate_vma_init); +#endif + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef AT_SYSINFO_EHDR + return &gate_vma; +#else + return NULL; +#endif +} + +int in_gate_area_no_task(unsigned long addr) +{ +#ifdef AT_SYSINFO_EHDR + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) + return 1; +#endif + return 0; +} + +#endif /* __HAVE_ARCH_GATE_AREA */ + +#ifdef CONFIG_HAVE_IOREMAP_PROT +int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot, resource_size_t *phys) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + resource_size_t phys_addr = 0; + struct mm_struct *mm = vma->vm_mm; + int ret = -EINVAL; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. 
*/ + if (pmd_huge(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; + + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + phys_addr = pte_pfn(pte); + phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ + + *prot = pgprot_val(pte_pgprot(pte)); + *phys = phys_addr; + ret = 0; + +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return ret; +} + +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void __iomem *maddr; + int offset = addr & (PAGE_SIZE-1); + + if (follow_phys(vma, addr, write, &prot, &phys_addr)) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + iounmap(maddr); + + return len; +} +#endif + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; + void *maddr; + struct page *page = NULL; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) { + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. + */ +#ifdef CONFIG_HAVE_IOREMAP_PROT + vma = find_vma(mm, addr); + if (!vma) + break; + if (vma->vm_ops && vma->vm_ops->access) + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) +#endif + break; + bytes = ret; + } else { + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + } + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} + +/* + * Print the name of a VMA. 
+ */ +void print_vma_addr(char *prefix, unsigned long ip) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + /* + * Do not print if we are in atomic + * contexts (in exception stacks, etc.): + */ + if (preempt_count()) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma && vma->vm_file) { + struct file *f = vma->vm_file; + char *buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) { + char *p, *s; + + p = d_path(&f->f_path, buf, PAGE_SIZE); + if (IS_ERR(p)) + p = "?"; + s = strrchr(p, '/'); + if (s) + p = s+1; + printk("%s%s[%lx+%lx]", prefix, p, + vma->vm_start, + vma->vm_end - vma->vm_start); + free_page((unsigned long)buf); + } + } + up_read(&current->mm->mmap_sem); +} + +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void) +{ + /* + * Some code (nfs/sunrpc) uses socket ops on kernel memory while + * holding the mmap_sem, this is safe because kernel memory doesn't + * get paged out, therefore we'll never actually fault, and the + * below annotations will generate false positives. + */ + if (segment_eq(get_fs(), KERNEL_DS)) + return; + + might_sleep(); + /* + * it would be nicer only to annotate paths which are not under + * pagefault_disable, however that requires a larger audit and + * providing helpers like get_user_atomic. + */ + if (!in_atomic() && current->mm) + might_lock_read(&current->mm->mmap_sem); +} +EXPORT_SYMBOL(might_fault); +#endif +#endif /* DDE_LINUX */ diff --git a/libdde-linux26/lib/src/mm/page-writeback.c b/libdde-linux26/lib/src/mm/page-writeback.c new file mode 100644 index 00000000..8a325e2a --- /dev/null +++ b/libdde-linux26/lib/src/mm/page-writeback.c @@ -0,0 +1,1468 @@ +/* + * mm/page-writeback.c + * + * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Contains functions related to writing back dirty pages at the + * address_space level. + * + * 10Apr2002 Andrew Morton + * Initial version + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/init.h> +#include <linux/backing-dev.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/blkdev.h> +#include <linux/mpage.h> +#include <linux/rmap.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/sysctl.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/buffer_head.h> +#include <linux/pagevec.h> + +/* + * The maximum number of pages to writeout in a single bdflush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +/* + * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited + * will look to see if it needs to force writeback or throttling. + */ +static long ratelimit_pages = 32; + +/* + * When balance_dirty_pages decides that the caller needs to perform some + * non-background writeback, this is how many pages it will attempt to write. + * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * large amounts of I/O are submitted.
+ */ +static inline long sync_writeback_pages(void) +{ + return ratelimit_pages + ratelimit_pages / 2; +} + +/* The following parameters are exported via /proc/sys/vm */ + +/* + * Start background writeback (via pdflush) at this percentage + */ +int dirty_background_ratio = 5; + +/* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of + * dirty_background_ratio * the amount of dirtyable memory + */ +unsigned long dirty_background_bytes; + +/* + * free highmem will not be subtracted from the total free memory + * for calculating free ratios if vm_highmem_is_dirtyable is true + */ +int vm_highmem_is_dirtyable; + +/* + * The generator of dirty data starts writeback at this percentage + */ +int vm_dirty_ratio = 10; + +/* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of + * vm_dirty_ratio * the amount of dirtyable memory + */ +unsigned long vm_dirty_bytes; + +/* + * The interval between `kupdate'-style writebacks, in jiffies + */ +#ifndef DDE_LINUX +int dirty_writeback_interval = 5 * HZ; +#else +int dirty_writeback_interval = 1250; +#endif + +#ifndef DDE_LINUX +/* + * The longest number of jiffies for which data is allowed to remain dirty + */ +int dirty_expire_interval = 30 * HZ; +#else +int dirty_expire_interval = 7500; +#endif + +/* + * Flag that makes the machine dump writes/reads and block dirtyings. + */ +int block_dump; + +/* + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. + */ +int laptop_mode; + +EXPORT_SYMBOL(laptop_mode); + +/* End of sysctl-exported parameters */ + + +static void background_writeout(unsigned long _min_pages); + +/* + * Scale the writeback cache size proportional to the relative writeout speeds. + * + * We do this by keeping a floating proportion between BDIs, based on page + * writeback completions [end_page_writeback()]. Those devices that write out + * pages fastest will get the larger share, while the slower will get a smaller + * share. + * + * We use page writeout completions because we are interested in getting rid of + * dirty pages. Having them written out is the primary goal. + * + * We introduce a concept of time, a period over which we measure these events, + * because demand can/will vary over time. The length of this period itself is + * measured in page writeback completions. + * + */ +static struct prop_descriptor vm_completions; +static struct prop_descriptor vm_dirties; + +/* + * couple the period to the dirty_ratio: + * + * period/2 ~ roundup_pow_of_two(dirty limit) + */ +static int calc_period_shift(void) +{ + unsigned long dirty_total; + + if (vm_dirty_bytes) + dirty_total = vm_dirty_bytes / PAGE_SIZE; + else + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + 100; + return 2 + ilog2(dirty_total - 1); +} + +/* + * update the period when the dirty threshold changes. 
+ */ +static void update_completion_period(void) +{ + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); +} + +int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} + +int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} + +int dirty_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_ratio = vm_dirty_ratio; + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_ratio != old_ratio) { + update_completion_period(); + vm_dirty_bytes = 0; + } + return ret; +} + + +int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned long old_bytes = vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + update_completion_period(); + vm_dirty_ratio = 0; + } + return ret; +} + +/* + * Increment the BDI's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +{ + __prop_inc_percpu_max(&vm_completions, &bdi->completions, + bdi->max_prop_frac); +} + +void bdi_writeout_inc(struct backing_dev_info *bdi) +{ + unsigned long flags; + + local_irq_save(flags); + __bdi_writeout_inc(bdi); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(bdi_writeout_inc); + +void task_dirty_inc(struct task_struct *tsk) +{ + prop_inc_single(&vm_dirties, &tsk->dirties); +} + +/* + * Obtain an accurate fraction of the BDI's portion. + */ +static void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator) +{ + if (bdi_cap_writeback_dirty(bdi)) { + prop_fraction_percpu(&vm_completions, &bdi->completions, + numerator, denominator); + } else { + *numerator = 0; + *denominator = 1; + } +} + +/* + * Clip the earned share of dirty pages to that which is actually available. + * This avoids exceeding the total dirty_limit when the floating averages + * fluctuate too quickly. 
+ */ +static void +clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +{ + long avail_dirty; + + avail_dirty = dirty - + (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_WRITEBACK) + + global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK_TEMP)); + + if (avail_dirty < 0) + avail_dirty = 0; + + avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); + + *pbdi_dirty = min(*pbdi_dirty, avail_dirty); +} + +static inline void task_dirties_fraction(struct task_struct *tsk, + long *numerator, long *denominator) +{ + prop_fraction_single(&vm_dirties, &tsk->dirties, + numerator, denominator); +} + +/* + * scale the dirty limit + * + * task specific dirty limit: + * + * dirty -= (dirty/8) * p_{t} + */ +static void task_dirty_limit(struct task_struct *tsk, long *pdirty) +{ + long numerator, denominator; + long dirty = *pdirty; + u64 inv = dirty >> 3; + + task_dirties_fraction(tsk, &numerator, &denominator); + inv *= numerator; + do_div(inv, denominator); + + dirty -= inv; + if (dirty < *pdirty/2) + dirty = *pdirty/2; + + *pdirty = dirty; +} + +/* + * + */ +static DEFINE_SPINLOCK(bdi_lock); +static unsigned int bdi_min_ratio; + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&bdi_lock, flags); + if (min_ratio > bdi->max_ratio) { + ret = -EINVAL; + } else { + min_ratio -= bdi->min_ratio; + if (bdi_min_ratio + min_ratio < 100) { + bdi_min_ratio += min_ratio; + bdi->min_ratio += min_ratio; + } else { + ret = -EINVAL; + } + } + spin_unlock_irqrestore(&bdi_lock, flags); + + return ret; +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +{ + unsigned long flags; + int ret = 0; + + if (max_ratio > 100) + return -EINVAL; + + spin_lock_irqsave(&bdi_lock, flags); + if (bdi->min_ratio > max_ratio) { + ret = -EINVAL; + } else { + bdi->max_ratio = max_ratio; + bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; + } + spin_unlock_irqrestore(&bdi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(bdi_set_max_ratio); + +/* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. + */ + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the number of pages that can currently be freed and used + * by the kernel for direct mappings.
+ */ +unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + +void +get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned long *pbdi_dirty, struct backing_dev_info *bdi) +{ + unsigned long background; + unsigned long dirty; + unsigned long available_memory = determine_dirtyable_memory(); + struct task_struct *tsk; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else { + int dirty_ratio; + + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio < 5) + dirty_ratio = 5; + dirty = (dirty_ratio * available_memory) / 100; + } + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; + + if (background >= dirty) + background = dirty / 2; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; + + if (bdi) { + u64 bdi_dirty; + long numerator, denominator; + + /* + * Calculate this BDI's share of the dirty ratio. + */ + bdi_writeout_fraction(bdi, &numerator, &denominator); + + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; + + *pbdi_dirty = bdi_dirty; + clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); + task_dirty_limit(current, pbdi_dirty); + } +} + +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force + * the caller to perform writeback if the system is over `vm_dirty_ratio'. + * If we're over `background_thresh' then pdflush is woken to perform some + * writeout. + */ +static void balance_dirty_pages(struct address_space *mapping) +{ + long nr_reclaimable, bdi_nr_reclaimable; + long nr_writeback, bdi_nr_writeback; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; + unsigned long pages_written = 0; + unsigned long write_chunk = sync_writeback_pages(); + + struct backing_dev_info *bdi = mapping->backing_dev_info; + + for (;;) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = write_chunk, + .range_cyclic = 1, + }; + + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); + + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + nr_writeback = global_page_state(NR_WRITEBACK); + + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + break; + + /* + * Throttle it only when the background writeback cannot + * catch-up. This avoids (excessively) small writeouts + * when the bdi limits are ramping up. + */ + if (nr_reclaimable + nr_writeback < + (background_thresh + dirty_thresh) / 2) + break; + + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; + + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. + * Unstable writes are a feature of certain networked + * filesystems (i.e. 
NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + */ + if (bdi_nr_reclaimable) { + writeback_inodes(&wbc); + pages_written += write_chunk - wbc.nr_to_write; + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); + } + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (bdi_thresh < 2*bdi_stat_error(bdi)) { + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + } else if (bdi_nr_reclaimable) { + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + } + + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + break; + if (pages_written >= write_chunk) + break; /* We've done our duty */ + + congestion_wait(WRITE, HZ/10); + } + + if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && + bdi->dirty_exceeded) + bdi->dirty_exceeded = 0; + + if (writeback_in_progress(bdi)) + return; /* pdflush is already working this queue */ + + /* + * In laptop mode, we wait until hitting the higher threshold before + * starting background writeout, and then write out all the way down + * to the lower threshold. So slow writers cause minimal disk activity. + * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. + */ + if ((laptop_mode && pages_written) || + (!laptop_mode && (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + > background_thresh))) + pdflush_operation(background_writeout, 0); +} + +void set_page_dirty_balance(struct page *page, int page_mkwrite) +{ + if (set_page_dirty(page) || page_mkwrite) { + struct address_space *mapping = page_mapping(page); + + if (mapping) + balance_dirty_pages_ratelimited(mapping); + } +} + +/** + * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * @mapping: address_space which was dirtied + * @nr_pages_dirtied: number of pages which the caller has just dirtied + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * On really big machines, get_writeback_state is expensive, so try to avoid + * calling it too often (ratelimiting). But once we're over the dirty memory + * limit we decrease the ratelimiting by a lot, to prevent individual processes + * from overshooting the limit by (ratelimit_pages) each. + */ +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) +{ + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; + + ratelimit = ratelimit_pages; + if (mapping->backing_dev_info->dirty_exceeded) + ratelimit = 8; + + /* + * Check the rate limiting. Also, we do not want to throttle real-time + * tasks in balance_dirty_pages(). Period. 
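The per-CPU counter that follows can be pictured with a simplified single-threaded analogue (not part of this import; the 1024-page threshold is an invented stand-in for ratelimit_pages, while the drop to 8 pages once the limit is exceeded mirrors the code above):

#include <stdio.h>

/* Simplified model of the ratelimits counter: the expensive global
 * check (balance_dirty_pages() in the real code) runs only once per
 * 'ratelimit' dirtied pages, and far more often once the backing
 * device has exceeded its dirty threshold. */
static unsigned long counter;

static int should_balance(unsigned long nr_dirtied, int dirty_exceeded)
{
    unsigned long ratelimit = dirty_exceeded ? 8 : 1024;    /* invented default */

    counter += nr_dirtied;
    if (counter >= ratelimit) {
        counter = 0;
        return 1;       /* caller would now call balance_dirty_pages() */
    }
    return 0;
}

int main(void)
{
    int calls = 0, i;

    for (i = 0; i < 10000; i++)
        calls += should_balance(1, 0);
    printf("10000 dirtied pages -> %d balance calls\n", calls);
    return 0;
}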
+ */ + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); + balance_dirty_pages(mapping); + return; + } + preempt_enable(); +} +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); + +void throttle_vm_writeout(gfp_t gfp_mask) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + + for ( ; ; ) { + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + + /* + * Boost the allowable dirty threshold a bit for page + * allocators so they don't get DoS'ed by heavy writers + */ + dirty_thresh += dirty_thresh / 10; /* wheeee... */ + + if (global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK) <= dirty_thresh) + break; + congestion_wait(WRITE, HZ/10); + + /* + * The caller might hold locks which can prevent IO completion + * or progress in the filesystem. So we cannot just sit here + * waiting for IO to complete. + */ + if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) + break; + } +} + +/* + * writeback at least _min_pages, and keep writing until the amount of dirty + * memory is less than the background threshold, or until we're all clean. + */ +static void background_writeout(unsigned long _min_pages) +{ + long min_pages = _min_pages; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 0, + .nonblocking = 1, + .range_cyclic = 1, + }; + + for ( ; ; ) { + unsigned long background_thresh; + unsigned long dirty_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) < background_thresh + && min_pages <= 0) + break; + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes(&wbc); + min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { + /* Wrote less than expected */ + if (wbc.encountered_congestion || wbc.more_io) + congestion_wait(WRITE, HZ/10); + else + break; + } + } +} + +/* + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. Returns 0 if a pdflush thread was dispatched. Returns + * -1 if all pdflush threads were busy. + */ +int wakeup_pdflush(long nr_pages) +{ + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + return pdflush_operation(background_writeout, nr_pages); +} + +#ifndef DDE_LINUX +static void wb_timer_fn(unsigned long unused); +static void laptop_timer_fn(unsigned long unused); + +static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); +static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); + +/* + * Periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. 
+ */ +static void wb_kupdate(unsigned long arg) +{ + unsigned long oldest_jif; + unsigned long start_jif; + unsigned long next_jif; + long nr_to_write; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = &oldest_jif, + .nr_to_write = 0, + .nonblocking = 1, + .for_kupdate = 1, + .range_cyclic = 1, + }; + + sync_supers(); + + oldest_jif = jiffies - dirty_expire_interval; + start_jif = jiffies; + next_jif = start_jif + dirty_writeback_interval; + nr_to_write = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + while (nr_to_write > 0) { + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + writeback_inodes(&wbc); + if (wbc.nr_to_write > 0) { + if (wbc.encountered_congestion || wbc.more_io) + congestion_wait(WRITE, HZ/10); + else + break; /* All the old data is written */ + } + nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + } + if (time_before(next_jif, jiffies + HZ)) + next_jif = jiffies + HZ; + if (dirty_writeback_interval) + mod_timer(&wb_timer, next_jif); +} + +/* + * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs + */ +int dirty_writeback_centisecs_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + else + del_timer(&wb_timer); + return 0; +} + +static void wb_timer_fn(unsigned long unused) +{ + if (pdflush_operation(wb_kupdate, 0) < 0) + mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ +} + +static void laptop_flush(unsigned long unused) +{ + sys_sync(); +} + +static void laptop_timer_fn(unsigned long unused) +{ + pdflush_operation(laptop_flush, 0); +} + +/* + * We've spun up the disk and we're in laptop mode: schedule writeback + * of all dirty data a few seconds from now. If the flush is already scheduled + * then push it back - the user is still using the disk. + */ +void laptop_io_completion(void) +{ + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); +} + +/* + * We're in laptop mode and we've just synced. The sync's writes will have + * caused another writeback to be scheduled by laptop_io_completion. + * Nothing needs to be written back anymore, so we unschedule the writeback. + */ +void laptop_sync_completion(void) +{ + del_timer(&laptop_mode_wb_timer); +} +#endif + +/* + * If ratelimit_pages is too high then we can get into dirty-data overload + * if a large number of processes all perform writes at the same time. + * If it is too low then SMP machines will call the (expensive) + * get_writeback_state too often. + * + * Here we set ratelimit_pages to a level which ensures that when all CPUs are + * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory + * thresholds before writeback cuts in. + * + * But the limit should not be set too high. Because it also controls the + * amount of memory which the balance_dirty_pages() caller has to write back. + * If this is too large then the caller will block on the IO queue all the + * time. So limit it to four megabytes - the balance_dirty_pages() caller + * will write six megabyte chunks, max. 
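The clamping that writeback_set_ratelimit() applies further below is easy to check numerically. A userspace rework (not part of this import; the 4 KiB page size and the 4 GiB / 8-CPU machine are assumptions for the example):

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL      /* assumed page size */

/* Mirror of writeback_set_ratelimit(): size the per-CPU counter so all
 * CPUs together overshoot the dirty threshold by at most ~3% (1/32),
 * but never ask one caller to write back more than 4 MiB at a time. */
static unsigned long set_ratelimit(unsigned long total_pages, unsigned cpus)
{
    unsigned long ratelimit = total_pages / (cpus * 32);

    if (ratelimit < 16)
        ratelimit = 16;
    if (ratelimit * PAGE_CACHE_SIZE > 4096 * 1024)
        ratelimit = (4096 * 1024) / PAGE_CACHE_SIZE;
    return ratelimit;
}

int main(void)
{
    /* 4 GiB of 4 KiB pages on 8 CPUs ends up clamped at the 4 MiB cap. */
    printf("ratelimit_pages = %lu\n", set_ratelimit(1048576, 8));   /* 1024 */
    return 0;
}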
+ */ + +void writeback_set_ratelimit(void) +{ + ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); + if (ratelimit_pages < 16) + ratelimit_pages = 16; + if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) + ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; +} + +static int __cpuinit +ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) +{ + writeback_set_ratelimit(); + return NOTIFY_DONE; +} + +static struct notifier_block __cpuinitdata ratelimit_nb = { + .notifier_call = ratelimit_handler, + .next = NULL, +}; + +/* + * Called early on to tune the page writeback dirty limits. + * + * We used to scale dirty pages according to how total memory + * related to pages that could be allocated for buffers (by + * comparing nr_free_buffer_pages() to vm_total_pages. + * + * However, that was when we used "dirty_ratio" to scale with + * all memory, and we don't do that any more. "dirty_ratio" + * is now applied to total non-HIGHPAGE memory (by subtracting + * totalhigh_pages from vm_total_pages), and as such we can't + * get into the old insane situation any more where we had + * large amounts of dirty pages compared to a small amount of + * non-HIGHMEM memory. + * + * But we might still want to scale the dirty_ratio by how + * much memory the box has.. + */ +void __init page_writeback_init(void) +{ + int shift; + +#ifndef DDE_LINUX + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); +#endif + writeback_set_ratelimit(); + register_cpu_notifier(&ratelimit_nb); + + shift = calc_period_shift(); + prop_descriptor_init(&vm_completions, shift); + prop_descriptor_init(&vm_dirties, shift); +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. 
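The wrap-around behaviour of the range_cyclic walk implemented below is the least obvious part of write_cache_pages(). A toy, runnable model (not part of this import; the eight pages and the remembered start index of 5 are invented) shows the order in which pages are visited:

#include <stdio.h>

/* Toy model of the range_cyclic walk: start at the remembered
 * writeback_index, run to the end of the file, then wrap around once
 * and finish the pages that were skipped at the front. */
int main(void)
{
    int npages = 8;
    int writeback_index = 5;            /* prev offset, as in the real code */
    int index = writeback_index;
    int end = npages - 1;
    int cycled = (index == 0);

retry:
    for (; index <= end; index++)
        printf("write page %d\n", index);
    if (!cycled) {
        cycled = 1;
        index = 0;
        end = writeback_index - 1;      /* now cover pages 0..start-1 */
        goto retry;
    }
    return 0;                           /* visits 5,6,7 then 0..4 */
}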
+ */ +int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int cycled; + int range_whole = 0; + long nr_to_write = wbc->nr_to_write; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } +retry: + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + done = 1; + break; + } + + done_index = page->index + 1; + + lock_page(page); + + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done = 1; + break; + } + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. 
+ */ + done = 1; + break; + } + } + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!cycled && !done) { + /* + * range_cyclic: + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + if (!wbc->no_nrwrite_index_update) { + if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) + mapping->writeback_index = done_index; + wbc->nr_to_write = nr_to_write; + } + + return ret; +} +EXPORT_SYMBOL(write_cache_pages); + +#ifndef DDE_LINUX +/* + * Function used by generic_writepages to call the real writepage + * function and set the mapping flags on error + */ +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + +/** + * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * + * This is a library function, which implements the writepages() + * address_space_operation. + */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + return write_cache_pages(mapping, wbc, __writepage, mapping); +} + +EXPORT_SYMBOL(generic_writepages); + +int do_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + int ret; + + if (wbc->nr_to_write <= 0) + return 0; + wbc->for_writepages = 1; + if (mapping->a_ops->writepages) + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + wbc->for_writepages = 0; + return ret; +} + +/** + * write_one_page - write out a single page and optionally wait on I/O + * @page: the page to write + * @wait: if true, wait on writeout + * + * The page must be locked by the caller and will be unlocked upon return. + * + * write_one_page() returns a negative error code if I/O failed. + */ +int write_one_page(struct page *page, int wait) +{ + struct address_space *mapping = page->mapping; + int ret = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + BUG_ON(!PageLocked(page)); + + if (wait) + wait_on_page_writeback(page); + + if (clear_page_dirty_for_io(page)) { + page_cache_get(page); + ret = mapping->a_ops->writepage(page, &wbc); + if (ret == 0 && wait) { + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + page_cache_release(page); + } else { + unlock_page(page); + } + return ret; +} +EXPORT_SYMBOL(write_one_page); + +/* + * For address_spaces which do not use buffers nor write back. + */ +int __set_page_dirty_no_writeback(struct page *page) +{ + if (!PageDirty(page)) + SetPageDirty(page); + return 0; +} + +/* + * For address_spaces which do not use buffers. Just tag the page as dirty in + * its radix tree. + * + * This is also used when a single buffer is being dirtied: we want to set the + * page dirty in that case, but not all the buffers. This is a "bottom-up" + * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * + * Most callers have locked the page, which pins the address_space in memory. 
+ * But zap_pte_range() does not lock the page, however in that case the + * mapping is pinned by the vma's ->vm_file reference. + * + * We take care to handle the case where the page was truncated from the + * mapping by re-checking page_mapping() inside tree_lock. + */ +int __set_page_dirty_nobuffers(struct page *page) +{ + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + struct address_space *mapping2; + + if (!mapping) + return 1; + + spin_lock_irq(&mapping->tree_lock); + mapping2 = page_mapping(page); + if (mapping2) { /* Race with truncate? */ + BUG_ON(mapping2 != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&mapping->tree_lock); + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return 1; + } + return 0; +} +EXPORT_SYMBOL(__set_page_dirty_nobuffers); + +/* + * When a writepage implementation decides that it doesn't want to write this + * page for some reason, it should redirty the locked page via + * redirty_page_for_writepage() and it should then unlock the page and return 0 + */ +int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +{ + wbc->pages_skipped++; + return __set_page_dirty_nobuffers(page); +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +/* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + */ +int set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (likely(mapping)) { + int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); + } + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(set_page_dirty); + +/* + * set_page_dirty() is racy if the caller has no reference against + * page->mapping->host, and if the page is unlocked. This is because another + * CPU could truncate the page off the mapping and then free the mapping. + * + * Usually, the page _is_ locked, or the caller is a user-space process which + * holds a reference on the inode by having an open file. + * + * In other cases, the page should be locked before running set_page_dirty(). + */ +int set_page_dirty_lock(struct page *page) +{ + int ret; + + lock_page_nosync(page); + ret = set_page_dirty(page); + unlock_page(page); + return ret; +} +EXPORT_SYMBOL(set_page_dirty_lock); +#endif + +/* + * Clear a page's dirty flag, while caring for dirty memory accounting. + * Returns true if the page was previously dirty. + * + * This is for preparing to put the page under writeout. We leave the page + * tagged as dirty in the radix tree so that a concurrent write-for-sync + * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage + * implementation will run either set_page_writeback() or set_page_dirty(), + * at which stage we bring the page's dirty flag and radix-tree dirty tag + * back into sync. + * + * This incoherency between the page's dirty flag and radix-tree tag is + * unfortunate, but it only exists while the page is locked. 
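The Test*PageDirty() calls used above and in clear_page_dirty_for_io() below rely on one invariant: only a transition of the flag touches the accounting. A minimal userspace analogue of that pattern (not part of this import; the fake_page structure and counter are invented):

#include <stdio.h>

/* The return value of the test-and-set / test-and-clear helpers is the
 * old flag value, so each 0->1 and 1->0 transition adjusts the dirty
 * counter exactly once, no matter how often the helpers are called. */
struct fake_page { int dirty; };

static long nr_file_dirty;              /* stands in for NR_FILE_DIRTY */

static int fake_set_page_dirty(struct fake_page *p)
{
    int was_dirty = p->dirty;

    p->dirty = 1;
    if (!was_dirty)
        nr_file_dirty++;
    return !was_dirty;
}

static int fake_clear_page_dirty_for_io(struct fake_page *p)
{
    int was_dirty = p->dirty;

    p->dirty = 0;
    if (was_dirty)
        nr_file_dirty--;
    return was_dirty;
}

int main(void)
{
    struct fake_page page = { 0 };

    fake_set_page_dirty(&page);
    fake_set_page_dirty(&page);         /* no double accounting */
    fake_clear_page_dirty_for_io(&page);
    printf("NR_FILE_DIRTY = %ld\n", nr_file_dirty);     /* 0 */
    return 0;
}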
+ */ +int clear_page_dirty_for_io(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + BUG_ON(!PageLocked(page)); + + ClearPageReclaim(page); + if (mapping && mapping_cap_account_dirty(mapping)) { + /* + * Yes, Virginia, this is indeed insane. + * + * We use this sequence to make sure that + * (a) we account for dirty stats properly + * (b) we tell the low-level filesystem to + * mark the whole page dirty if it was + * dirty in a pagetable. Only to then + * (c) clean the page again and return 1 to + * cause the writeback. + * + * This way we avoid all nasty races with the + * dirty bit in multiple places and clearing + * them concurrently from different threads. + * + * Note! Normally the "set_page_dirty(page)" + * has no effect on the actual dirty bit - since + * that will already usually be set. But we + * need the side effects, and it can help us + * avoid races. + * + * We basically use the page "master dirty bit" + * as a serialization point for all the different + * threads doing their things. + */ + if (page_mkclean(page)) + set_page_dirty(page); + /* + * We carefully synchronise fault handlers against + * installing a dirty pte and marking the page dirty + * at this point. We do this by having them hold the + * page lock at some point after installing their + * pte, but before marking the page dirty. + * Pages are always locked coming in here, so we get + * the desired exclusion. See mm/memory.c:do_wp_page() + * for more comments. + */ + if (TestClearPageDirty(page)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + return 1; + } + return 0; + } + return TestClearPageDirty(page); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +int test_clear_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestClearPageWriteback(page); + if (ret) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_account_writeback(bdi)) { + __dec_bdi_stat(bdi, BDI_WRITEBACK); + __bdi_writeout_inc(bdi); + } + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestClearPageWriteback(page); + } + if (ret) + dec_zone_page_state(page, NR_WRITEBACK); + return ret; +} + +#ifndef DDE_LINUX +int test_set_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestSetPageWriteback(page); + if (!ret) { + radix_tree_tag_set(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_account_writeback(bdi)) + __inc_bdi_stat(bdi, BDI_WRITEBACK); + } + if (!PageDirty(page)) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestSetPageWriteback(page); + } + if (!ret) + inc_zone_page_state(page, NR_WRITEBACK); + return ret; + +} +EXPORT_SYMBOL(test_set_page_writeback); +#endif /* DDE_LINUX */ + +/* + * Return true if any of the pages in the mapping are marked with the + * passed tag. 
+ */ +int mapping_tagged(struct address_space *mapping, int tag) +{ + int ret; + rcu_read_lock(); + ret = radix_tree_tagged(&mapping->page_tree, tag); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(mapping_tagged); diff --git a/libdde-linux26/lib/src/net/core/dev.c b/libdde-linux26/lib/src/net/core/dev.c new file mode 100644 index 00000000..cf036525 --- /dev/null +++ b/libdde-linux26/lib/src/net/core/dev.c @@ -0,0 +1,5286 @@ +/* + * NET3 Protocol independent device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * + * Additional Authors: + * Florian la Roche <rzsfl@rz.uni-sb.de> + * Alan Cox <gw4pts@gw4pts.ampr.org> + * David Hinds <dahinds@users.sourceforge.net> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Adam Sulmicki <adam@cfar.umd.edu> + * Pekka Riikonen <priikone@poesidon.pspt.fi> + * + * Changes: + * D.J. Barrow : Fixed bug where dev->refcnt gets set + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant + * stunts to keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? : Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into + * drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before + * calling netif_rx. Saves a function + * call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close + * changes. + * Rudi Cilibrasi : Pass the right thing to + * set_mac_address() + * Dave Miller : 32bit quantity for the device lock to + * make it work out on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. 
+ * Paul Rusty Russell : SIOCSIFNAME + * Pekka Riikonen : Netdev boot-time settings code + * Andrew Morton : Make unregister_netdevice wait + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling + * - netif_rx() feedback + */ + +#ifdef DDE_LINUX +#include "local.h" +#include <dde26_net.h> +#endif + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/capability.h> +#include <linux/cpu.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mutex.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/notifier.h> +#include <linux/skbuff.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/if_bridge.h> +#include <linux/if_macvlan.h> +#include <net/dst.h> +#include <net/pkt_sched.h> +#include <net/checksum.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/netpoll.h> +#include <linux/rcupdate.h> +#include <linux/delay.h> +#include <net/wext.h> +#include <net/iw_handler.h> +#include <asm/current.h> +#include <linux/audit.h> +#include <linux/dmaengine.h> +#include <linux/err.h> +#include <linux/ctype.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/jhash.h> +#include <linux/random.h> + +#include "net-sysfs.h" + +#include <ddekit/timer.h> + +/* Instead of increasing this, you should create a hash table. */ +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + +/* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. + * + * Why 16. Because with 16 the only overlap we get on a hash of the + * low nibble of the protocol value is RARP/SNAP/X.25. + * + * NOTE: That is no longer true with the addition of VLAN tags. Not + * sure which should go first, but I bet it won't make much + * difference if we are running VLANs. The good news is that + * this protocol won't be in the list unless compiled in, so + * the average user (w/out VLANs) will not be adversely affected. + * --BLG + * + * 0800 IP + * 8100 802.1Q VLAN + * 0001 802.3 + * 0002 AX.25 + * 0004 802.2 + * 8035 RARP + * 0005 SNAP + * 0805 X.25 + * 0806 ARP + * 8137 IPX + * 0009 Localtalk + * 86DD IPv6 + */ + +#define PTYPE_HASH_SIZE (16) +#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) + +static DEFINE_SPINLOCK(ptype_lock); +static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +static struct list_head ptype_all __read_mostly; /* Taps */ + +/* + * The @dev_base_head list is protected by @dev_base_lock and the rtnl + * semaphore. + * + * Pure readers hold dev_base_lock for reading. + * + * Writers must hold the rtnl semaphore while they loop through the + * dev_base_head list, and hold dev_base_lock for writing when they do the + * actual updates. This allows pure readers to access the list even + * while a writer is preparing to update it. 
+ * + * To put it another way, dev_base_lock is held for writing only to + * protect against pure readers; the rtnl semaphore provides the + * protection against other writers. + * + * See, for example usages, register_netdevice() and + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. + */ +DEFINE_RWLOCK(dev_base_lock); + +EXPORT_SYMBOL(dev_base_lock); + +#define NETDEV_HASHBITS 8 +#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) + +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +{ + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; +} + +/* Device list insertion */ +static int list_netdevice(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + ASSERT_RTNL(); + + write_lock_bh(&dev_base_lock); + list_add_tail(&dev->dev_list, &net->dev_base_head); + hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); + write_unlock_bh(&dev_base_lock); + return 0; +} + +/* Device list removal */ +static void unlist_netdevice(struct net_device *dev) +{ + ASSERT_RTNL(); + + /* Unlink dev from the device chain */ + write_lock_bh(&dev_base_lock); + list_del(&dev->dev_list); + hlist_del(&dev->name_hlist); + hlist_del(&dev->index_hlist); + write_unlock_bh(&dev_base_lock); +} + +/* + * Our notifier list + */ + +static RAW_NOTIFIER_HEAD(netdev_chain); + +/* + * Device drivers call our routines to queue packets here. We empty the + * queue in the local softnet handler. + */ + +DEFINE_PER_CPU(struct softnet_data, softnet_data); + +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = + {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, + ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, + ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *netdev_lock_name[] = + {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", 
"_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", + "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", + "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} +#endif + +/******************************************************************************* + + Protocol management and registration routines + +*******************************************************************************/ + +/* + * Add a protocol ID to the list. Now that the input handler is + * smarter we can dispense with all the messy stuff that used to be + * here. + * + * BEWARE!!! Protocol handlers, mangling input packets, + * MUST BE last in hash buckets and checking protocol handlers + * MUST start from promiscuous ptype_all chain in net_bh. + * It is true now, do not change it. + * Explanation follows: if protocol handler, mangling packet, will + * be the first on list, it is not able to sense, that packet + * is cloned and should be copied-on-write, so that it will + * change it and subsequent readers will get broken packet. + * --ANK (980803) + */ + +/** + * dev_add_pack - add packet handler + * @pt: packet type declaration + * + * Add a protocol handler to the networking stack. The passed &packet_type + * is linked into kernel lists and may not be freed until it has been + * removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new packet type (until the next received packet). + */ + +void dev_add_pack(struct packet_type *pt) +{ + int hash; + + spin_lock_bh(&ptype_lock); + if (pt->type == htons(ETH_P_ALL)) + list_add_rcu(&pt->list, &ptype_all); + else { + hash = ntohs(pt->type) & PTYPE_HASH_MASK; + list_add_rcu(&pt->list, &ptype_base[hash]); + } + spin_unlock_bh(&ptype_lock); +} + +/** + * __dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. 
+ */ +void __dev_remove_pack(struct packet_type *pt) +{ + struct list_head *head; + struct packet_type *pt1; + + spin_lock_bh(&ptype_lock); + + if (pt->type == htons(ETH_P_ALL)) + head = &ptype_all; + else + head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + + list_for_each_entry(pt1, head, list) { + if (pt == pt1) { + list_del_rcu(&pt->list); + goto out; + } + } + + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +out: + spin_unlock_bh(&ptype_lock); +} +/** + * dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_pack(struct packet_type *pt) +{ + __dev_remove_pack(pt); + + synchronize_net(); +} + +/****************************************************************************** + + Device Boot-time Settings Routines + +*******************************************************************************/ + +/* Boot time configuration table */ +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + +/** + * netdev_boot_setup_add - add new setup entry + * @name: name of the device + * @map: configured settings for the device + * + * Adds new setup entry to the dev_boot_setup list. The function + * returns 0 on error and 1 on success. This is a generic routine to + * all netdevices. + */ +static int netdev_boot_setup_add(char *name, struct ifmap *map) +{ + struct netdev_boot_setup *s; + int i; + + s = dev_boot_setup; + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { + memset(s[i].name, 0, sizeof(s[i].name)); + strlcpy(s[i].name, name, IFNAMSIZ); + memcpy(&s[i].map, map, sizeof(s[i].map)); + break; + } + } + + return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; +} + +/** + * netdev_boot_setup_check - check boot time settings + * @dev: the netdevice + * + * Check boot time settings for the device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found, 1 if they are. + */ +int netdev_boot_setup_check(struct net_device *dev) +{ + struct netdev_boot_setup *s = dev_boot_setup; + int i; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && + !strcmp(dev->name, s[i].name)) { + dev->irq = s[i].map.irq; + dev->base_addr = s[i].map.base_addr; + dev->mem_start = s[i].map.mem_start; + dev->mem_end = s[i].map.mem_end; + return 1; + } + } + return 0; +} + + +/** + * netdev_boot_base - get address from boot time settings + * @prefix: prefix for network device + * @unit: id for network device + * + * Check boot time settings for the base address of device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found. 
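Returning to dev_add_pack() above: the ptype_base[] bucket is simply the low nibble of the protocol number, which is why the comment near the top of the file singles out the RARP/SNAP/X.25 overlap. A standalone check (not part of this import; host-order protocol numbers are used directly instead of the htons()-encoded pt->type):

#include <stdio.h>

#define PTYPE_HASH_SIZE 16
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Print the hash bucket each well-known protocol lands in; RARP, SNAP
 * and X.25 all collide in bucket 5, exactly as the comment says. */
int main(void)
{
    static const struct { const char *name; unsigned type; } p[] = {
        { "IP",   0x0800 }, { "ARP",  0x0806 }, { "RARP", 0x8035 },
        { "SNAP", 0x0005 }, { "X.25", 0x0805 }, { "IPv6", 0x86DD },
    };
    unsigned i;

    for (i = 0; i < sizeof(p) / sizeof(p[0]); i++)
        printf("%-5s 0x%04x -> bucket %u\n",
               p[i].name, p[i].type, p[i].type & PTYPE_HASH_MASK);
    return 0;
}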
+ */ +unsigned long netdev_boot_base(const char *prefix, int unit) +{ + const struct netdev_boot_setup *s = dev_boot_setup; + char name[IFNAMSIZ]; + int i; + + sprintf(name, "%s%d", prefix, unit); + + /* + * If device already registered then return base of 1 + * to indicate not to probe for this interface + */ + if (__dev_get_by_name(&init_net, name)) + return 1; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) + if (!strcmp(name, s[i].name)) + return s[i].map.base_addr; + return 0; +} + +#ifndef DDE_LINUX +/* + * Saves at boot time configured settings for any netdevice. + */ +int __init netdev_boot_setup(char *str) +{ + int ints[5]; + struct ifmap map; + + str = get_options(str, ARRAY_SIZE(ints), ints); + if (!str || !*str) + return 0; + + /* Save settings */ + memset(&map, 0, sizeof(map)); + if (ints[0] > 0) + map.irq = ints[1]; + if (ints[0] > 1) + map.base_addr = ints[2]; + if (ints[0] > 2) + map.mem_start = ints[3]; + if (ints[0] > 3) + map.mem_end = ints[4]; + + /* Add new entry to the list */ + return netdev_boot_setup_add(str, &map); +} +#endif + +__setup("netdev=", netdev_boot_setup); + +/******************************************************************************* + + Device Interface Subroutines + +*******************************************************************************/ + +/** + * __dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. Must be called under RTNL semaphore + * or @dev_base_lock. If the name is found a pointer to the device + * is returned. If the name is not found then %NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. + */ + +struct net_device *__dev_get_by_name(struct net *net, const char *name) +{ + struct hlist_node *p; + + hlist_for_each(p, dev_name_hash(net, name)) { + struct net_device *dev + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + } + return NULL; +} + +/** + * dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. %NULL is returned if no + * matching device is found. + */ + +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(net, name); + if (dev) + dev_hold(dev); + read_unlock(&dev_base_lock); + return dev; +} + +/** + * __dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold either the RTNL semaphore + * or @dev_base_lock. 
+ */ + +struct net_device *__dev_get_by_index(struct net *net, int ifindex) +{ + struct hlist_node *p; + + hlist_for_each(p, dev_index_hash(net, ifindex)) { + struct net_device *dev + = hlist_entry(p, struct net_device, index_hlist); + if (dev->ifindex == ifindex) + return dev; + } + return NULL; +} + + +/** + * dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_index(net, ifindex); + if (dev) + dev_hold(dev); + read_unlock(&dev_base_lock); + return dev; +} + +/** + * dev_getbyhwaddr - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Search for an interface by MAC address. Returns NULL if the device + * is not found or a pointer to the device. The caller must hold the + * rtnl semaphore. The returned device has not had its ref count increased + * and the caller must therefore be careful about locking + * + * BUGS: + * If the API was consistent this would be __dev_get_by_hwaddr + */ + +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + + for_each_netdev(net, dev) + if (dev->type == type && + !memcmp(dev->dev_addr, ha, dev->addr_len)) + return dev; + + return NULL; +} + +EXPORT_SYMBOL(dev_getbyhwaddr); + +struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev->type == type) + return dev; + + return NULL; +} + +EXPORT_SYMBOL(__dev_getfirstbyhwtype); + +struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev; + + rtnl_lock(); + dev = __dev_getfirstbyhwtype(net, type); + if (dev) + dev_hold(dev); + rtnl_unlock(); + return dev; +} + +EXPORT_SYMBOL(dev_getfirstbyhwtype); + +/** + * dev_get_by_flags - find any device with given flags + * @net: the applicable net namespace + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. Returns NULL if a device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) +{ + struct net_device *dev, *ret; + + ret = NULL; + read_lock(&dev_base_lock); + for_each_netdev(net, dev) { + if (((dev->flags ^ if_flags) & mask) == 0) { + dev_hold(dev); + ret = dev; + break; + } + } + read_unlock(&dev_base_lock); + return ret; +} + +/** + * dev_valid_name - check if name is okay for network device + * @name: name string + * + * Network device names need to be valid file names to + * to allow sysfs to work. We also disallow any kind of + * whitespace. 
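dev_valid_name() below is pure string checking, so its rules can be demonstrated verbatim in user space (not part of this import; the test names are invented):

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#define IFNAMSIZ 16

/* Same rules as dev_valid_name(): non-empty, shorter than IFNAMSIZ,
 * not "." or "..", and free of '/' and whitespace. */
static int valid_name(const char *name)
{
    if (*name == '\0' || strlen(name) >= IFNAMSIZ)
        return 0;
    if (!strcmp(name, ".") || !strcmp(name, ".."))
        return 0;
    for (; *name; name++)
        if (*name == '/' || isspace((unsigned char)*name))
            return 0;
    return 1;
}

int main(void)
{
    const char *tests[] = { "eth0", "eth 0", "a/b", "..", "averyverylongdevicename" };
    unsigned i;

    for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
        printf("%-24s -> %s\n", tests[i], valid_name(tests[i]) ? "ok" : "rejected");
    return 0;
}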
+ */ +int dev_valid_name(const char *name) +{ + if (*name == '\0') + return 0; + if (strlen(name) >= IFNAMSIZ) + return 0; + if (!strcmp(name, ".") || !strcmp(name, "..")) + return 0; + + while (*name) { + if (*name == '/' || isspace(*name)) + return 0; + name++; + } + return 1; +} + +/** + * __dev_alloc_name - allocate a name for a device + * @net: network namespace to allocate the device name in + * @name: name format string + * @buf: scratch buffer and result name string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +static int __dev_alloc_name(struct net *net, const char *name, char *buf) +{ + int i = 0; + const char *p; + const int max_netdevices = 8*PAGE_SIZE; + unsigned long *inuse; + struct net_device *d; + + p = strnchr(name, IFNAMSIZ-1, '%'); + if (p) { + /* + * Verify the string as this thing may have come from + * the user. There must be either one "%d" and no other "%" + * characters. + */ + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + /* Use one page as a bit array of possible slots */ + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); + if (!inuse) + return -ENOMEM; + + for_each_netdev(net, d) { + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, max_netdevices); + free_page((unsigned long) inuse); + } + + snprintf(buf, IFNAMSIZ, name, i); + if (!__dev_get_by_name(net, buf)) + return i; + + /* It is possible to run out of possible slots + * when the name is long and there isn't enough space left + * for the digits, or if all bits are used. + */ + return -ENFILE; +} + +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + char buf[IFNAMSIZ]; + struct net *net; + int ret; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} + + +/** + * dev_change_name - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. 
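The "%d" scan in __dev_alloc_name() above boils down to: mark every unit number already taken by a matching name, then take the first free slot. A userspace analogue (not part of this import; a 256-entry table replaces the one-page bit array, and the example interface names are invented):

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16

/* Find the first free unit for an "eth%d"-style format, skipping names
 * that do not match the format exactly (the snprintf/strncmp round
 * trip guards against sscanf matches that printf would not produce). */
static int alloc_unit(const char *fmt, const char *existing[], int n)
{
    unsigned char inuse[256] = { 0 };
    char buf[IFNAMSIZ];
    int i, unit;

    for (i = 0; i < n; i++) {
        if (sscanf(existing[i], fmt, &unit) != 1 || unit < 0 || unit > 255)
            continue;
        snprintf(buf, sizeof(buf), fmt, unit);
        if (!strncmp(buf, existing[i], IFNAMSIZ))
            inuse[unit] = 1;
    }
    for (unit = 0; unit < 256; unit++)
        if (!inuse[unit])
            return unit;
    return -1;
}

int main(void)
{
    const char *existing[] = { "eth0", "eth1", "eth3", "lo" };

    printf("next free name: eth%d\n", alloc_unit("eth%d", existing, 4));  /* eth2 */
    return 0;
}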
+ */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + char oldname[IFNAMSIZ]; + int err = 0; + int ret; + struct net *net; + + ASSERT_RTNL(); + BUG_ON(!dev_net(dev)); + + net = dev_net(dev); + if (dev->flags & IFF_UP) + return -EBUSY; + + if (!dev_valid_name(newname)) + return -EINVAL; + + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) + return 0; + + memcpy(oldname, dev->name, IFNAMSIZ); + + if (strchr(newname, '%')) { + err = dev_alloc_name(dev, newname); + if (err < 0) + return err; + } + else if (__dev_get_by_name(net, newname)) + return -EEXIST; + else + strlcpy(dev->name, newname, IFNAMSIZ); + +rollback: + /* For now only devices in the initial network namespace + * are in sysfs. + */ + if (net == &init_net) { + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; + } + } + + write_lock_bh(&dev_base_lock); + hlist_del(&dev->name_hlist); + hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); + ret = notifier_to_errno(ret); + + if (ret) { + if (err) { + printk(KERN_ERR + "%s: name change rollback failed: %d.\n", + dev->name, ret); + } else { + err = ret; + memcpy(dev->name, oldname, IFNAMSIZ); + goto rollback; + } + } + + return err; +} + +/** + * dev_set_alias - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device, + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + ASSERT_RTNL(); + + if (len >= IFALIASZ) + return -EINVAL; + + if (!len) { + if (dev->ifalias) { + kfree(dev->ifalias); + dev->ifalias = NULL; + } + return 0; + } + + dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL); + if (!dev->ifalias) + return -ENOMEM; + + strlcpy(dev->ifalias, alias, len+1); + return len; +} + + +/** + * netdev_features_change - device changes features + * @dev: device to cause notification + * + * Called to indicate a device has changed features. + */ +void netdev_features_change(struct net_device *dev) +{ + call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL(netdev_features_change); + +/** + * netdev_state_change - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifiers(NETDEV_CHANGE, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + } +} + +void netdev_bonding_change(struct net_device *dev) +{ + call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); +} +EXPORT_SYMBOL(netdev_bonding_change); + +/** + * dev_load - load a network module + * @net: the applicable net namespace + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(struct net *net, const char *name) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(net, name); + read_unlock(&dev_base_lock); + + if (!dev && capable(CAP_SYS_MODULE)) + request_module("%s", name); +} + +/** + * dev_open - prepare an interface for use. + * @dev: device to open + * + * Takes a device from down to up state. 
The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + */ +int dev_open(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret = 0; + + ASSERT_RTNL(); + + /* + * Is it already up? + */ + + if (dev->flags & IFF_UP) + return 0; + + /* + * Is it even present? + */ + if (!netif_device_present(dev)) + return -ENODEV; + + /* + * Call device private open method + */ + set_bit(__LINK_STATE_START, &dev->state); + + if (ops->ndo_validate_addr) + ret = ops->ndo_validate_addr(dev); + + if (!ret && ops->ndo_open) + ret = ops->ndo_open(dev); + + /* + * If it went open OK then: + */ + + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + else { + /* + * Set the flags. + */ + dev->flags |= IFF_UP; + + /* + * Enable NET_DMA + */ + net_dmaengine_get(); + + /* + * Initialize multicasting status + */ + dev_set_rx_mode(dev); + + /* + * Wakeup transmit queue engine + */ + dev_activate(dev); + + /* + * ... and announce new interface. + */ + call_netdevice_notifiers(NETDEV_UP, dev); + } + + return ret; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +int dev_close(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + ASSERT_RTNL(); + + might_sleep(); + + if (!(dev->flags & IFF_UP)) + return 0; + + /* + * Tell people we are going down, so that they can + * prepare to death, when device is still operating. + */ + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, + * it can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + + dev_deactivate(dev); + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (ops->ndo_stop) + ops->ndo_stop(dev); + + /* + * Device is now down. + */ + + dev->flags &= ~IFF_UP; + + /* + * Tell people we are down + */ + call_netdevice_notifiers(NETDEV_DOWN, dev); + + /* + * Shutdown NET_DMA + */ + net_dmaengine_put(); + + return 0; +} + + +/** + * dev_disable_lro - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + if (dev->ethtool_ops && dev->ethtool_ops->get_flags && + dev->ethtool_ops->set_flags) { + u32 flags = dev->ethtool_ops->get_flags(dev); + if (flags & ETH_FLAG_LRO) { + flags &= ~ETH_FLAG_LRO; + dev->ethtool_ops->set_flags(dev, flags); + } + } + WARN_ON(dev->features & NETIF_F_LRO); +} +EXPORT_SYMBOL(dev_disable_lro); + + +static int dev_boot_phase = 1; + +/* + * Device change register/unregister. These are not inline or static + * as we export them to the world. 
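register_netdevice_notifier() and call_netdevice_notifiers() below follow the generic notifier-chain pattern: subscribers sit on a linked list and are called in order with an event code and a payload. A self-contained userspace sketch of that pattern (not part of this import; the demo_notifier structure, the event value and the "eth0" payload are invented for illustration):

#include <stdio.h>

/* Minimal publish/subscribe chain in the style of the kernel's
 * notifier blocks: registration pushes onto a singly linked list,
 * notification walks it and invokes each callback. */
struct demo_notifier {
    int (*notifier_call)(struct demo_notifier *nb,
                         unsigned long event, void *data);
    struct demo_notifier *next;
};

static struct demo_notifier *chain;

static void demo_register(struct demo_notifier *nb)
{
    nb->next = chain;
    chain = nb;
}

static void demo_call(unsigned long event, void *data)
{
    struct demo_notifier *nb;

    for (nb = chain; nb; nb = nb->next)
        nb->notifier_call(nb, event, data);
}

static int print_event(struct demo_notifier *nb, unsigned long event, void *data)
{
    (void)nb;
    printf("event %lu on %s\n", event, (const char *)data);
    return 0;
}

int main(void)
{
    struct demo_notifier nb = { print_event, NULL };

    demo_register(&nb);
    demo_call(1, "eth0");       /* e.g. a NETDEV_UP-style event */
    return 0;
}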
+ */ + +/** + * register_netdevice_notifier - register a network notifier block + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + struct net_device *dev; + struct net_device *last; + struct net *net; + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + for_each_net(net) { + for_each_netdev(net, dev) { + err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + goto rollback; + + if (!(dev->flags & IFF_UP)) + continue; + + nb->notifier_call(nb, NETDEV_UP, dev); + } + } + +unlock: + rtnl_unlock(); + return err; + +rollback: + last = dev; + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev == last) + break; + + if (dev->flags & IFF_UP) { + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); + nb->notifier_call(nb, NETDEV_DOWN, dev); + } + nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + } + } + + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} + +/** + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + */ + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&netdev_chain, nb); + rtnl_unlock(); + return err; +} + +/** + * call_netdevice_notifiers - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +int call_netdevice_notifiers(unsigned long val, struct net_device *dev) +{ + return raw_notifier_call_chain(&netdev_chain, val, dev); +} + +/* When > 0 there are consumers of rx skb time stamps */ +static atomic_t netstamp_needed = ATOMIC_INIT(0); + +void net_enable_timestamp(void) +{ + atomic_inc(&netstamp_needed); +} + +void net_disable_timestamp(void) +{ + atomic_dec(&netstamp_needed); +} + +static inline void net_timestamp(struct sk_buff *skb) +{ + if (atomic_read(&netstamp_needed)) + __net_timestamp(skb); + else + skb->tstamp.tv64 = 0; +} + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. 
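+ *
+ * "Taps" are the packet_type handlers registered on &ptype_all
+ * (ETH_P_ALL), e.g. packet sockets used by sniffers. Every such handler
+ * bound to this device (or to no device) gets its own clone of the
+ * outgoing skb, except the socket the frame originated from.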
+ */ + +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +{ + struct packet_type *ptype; + + net_timestamp(skb); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && + (ptype->af_packet_priv == NULL || + (struct sock *)ptype->af_packet_priv != skb->sk)) { + struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); + if (!skb2) + break; + + /* skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb2->network_header > skb2->tail) { + if (net_ratelimit()) + printk(KERN_CRIT "protocol %04x is " + "buggy, dev %s\n", + skb2->protocol, dev->name); + skb_reset_network_header(skb2); + } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + ptype->func(skb2, skb->dev, ptype, skb->dev); + } + } + rcu_read_unlock(); +} + + +static inline void __netif_reschedule(struct Qdisc *q) +{ + struct softnet_data *sd; + unsigned long flags; + + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + q->next_sched = sd->output_queue; + sd->output_queue = q; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} + +void __netif_schedule(struct Qdisc *q) +{ + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) + __netif_reschedule(q); +} +EXPORT_SYMBOL(__netif_schedule); + +void dev_kfree_skb_irq(struct sk_buff *skb) +{ + if (atomic_dec_and_test(&skb->users)) { + struct softnet_data *sd; + unsigned long flags; + + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + skb->next = sd->completion_queue; + sd->completion_queue = skb; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(dev_kfree_skb_irq); + +void dev_kfree_skb_any(struct sk_buff *skb) +{ + if (in_irq() || irqs_disabled()) + dev_kfree_skb_irq(skb); + else + dev_kfree_skb(skb); +} +EXPORT_SYMBOL(dev_kfree_skb_any); + + +/** + * netif_device_detach - mark device as removed + * @dev: network device + * + * Mark device as removed from system and therefore no longer available. + */ +void netif_device_detach(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_stop_queue(dev); + } +} +EXPORT_SYMBOL(netif_device_detach); + +/** + * netif_device_attach - mark device as attached + * @dev: network device + * + * Mark device as attached from system and restart if needed. 
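+ *
+ * Drivers normally pair this with netif_device_detach(): detach in the
+ * suspend or hot-unplug path, attach again on resume, so the stack stops
+ * queueing packets while the hardware is unavailable.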
+ */ +void netif_device_attach(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_wake_queue(dev); + __netdev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(netif_device_attach); + +static bool can_checksum_protocol(unsigned long features, __be16 protocol) +{ + return ((features & NETIF_F_GEN_CSUM) || + ((features & NETIF_F_IP_CSUM) && + protocol == htons(ETH_P_IP)) || + ((features & NETIF_F_IPV6_CSUM) && + protocol == htons(ETH_P_IPV6))); +} + +static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) +{ + if (can_checksum_protocol(dev->features, skb->protocol)) + return true; + + if (skb->protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + if (can_checksum_protocol(dev->features & dev->vlan_features, + veh->h_vlan_encapsulated_proto)) + return true; + } + + return false; +} + +/* + * Invalidate hardware checksum when packet is to be mangled, and + * complete checksum manually on outgoing path. + */ +int skb_checksum_help(struct sk_buff *skb) +{ + __wsum csum; + int ret = 0, offset; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + goto out_set_summed; + + if (unlikely(skb_shinfo(skb)->gso_size)) { + /* Let GSO fix up the checksum. */ + goto out_set_summed; + } + + offset = skb->csum_start - skb_headroom(skb); + BUG_ON(offset >= skb_headlen(skb)); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + + offset += skb->csum_offset; + BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); + + if (skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(__sum16))) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + + *(__sum16 *)(skb->data + offset) = csum_fold(csum); +out_set_summed: + skb->ip_summed = CHECKSUM_NONE; +out: + return ret; +} + +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + __be16 type = skb->protocol; + int err; + + skb_reset_mac_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + __skb_pull(skb, skb->mac_len); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + struct net_device *dev = skb->dev; + struct ethtool_drvinfo info = {}; + + if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) + dev->ethtool_ops->get_drvinfo(dev, &info); + + WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " + "ip_summed=%d", + info.driver, dev ? dev->features : 0L, + skb->sk ? 
skb->sk->sk_route_caps : 0L, + skb->len, skb->data_len, skb->ip_summed); + + if (skb_header_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + err = ptype->gso_send_check(skb); + segs = ERR_PTR(err); + if (err || skb_gso_ok(skb, features)) + break; + __skb_push(skb, (skb->data - + skb_network_header(skb))); + } + segs = ptype->gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} + +EXPORT_SYMBOL(skb_gso_segment); + +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? dev->name : "<unknown>"); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + +/* Actually, we should eliminate this check as soon as we know, that: + * 1. IOMMU is present and allows to map all the memory. + * 2. No high memory really exists on this machine. + */ + +static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_HIGHMEM + int i; + + if (dev->features & NETIF_F_HIGHDMA) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + return 1; + +#endif + return 0; +} + +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * + * This function segments the given skb and stores the list of segments + * in skb->next. + */ +static int dev_gso_segment(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct sk_buff *segs; + int features = dev->features & ~(illegal_highdma(dev, skb) ? + NETIF_F_SG : 0); + + segs = skb_gso_segment(skb, features); + + /* Verifying header integrity only. 
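+	 * skb_gso_segment() returns NULL when GSO was only used to verify
+	 * the headers and no real segmentation is needed, so the original
+	 * skb can be transmitted unchanged.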
*/ + if (!segs) + return 0; + + if (IS_ERR(segs)) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; + + return 0; +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + prefetch(&dev->netdev_ops->ndo_start_xmit); + if (likely(!skb->next)) { + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + + if (netif_needs_gso(dev, skb)) { + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + if (skb->next) + goto gso; + } + + return ops->ndo_start_xmit(skb, dev); + } + +gso: + do { + struct sk_buff *nskb = skb->next; + int rc; + + skb->next = nskb->next; + nskb->next = NULL; + rc = ops->ndo_start_xmit(nskb, dev); + if (unlikely(rc)) { + nskb->next = skb->next; + skb->next = nskb; + return rc; + } + if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) + return NETDEV_TX_BUSY; + } while (skb->next); + + skb->destructor = DEV_GSO_CB(skb)->destructor; + +out_kfree_skb: + kfree_skb(skb); + return 0; +} + +static u32 simple_tx_hashrnd; +static int simple_tx_hashrnd_initialized = 0; + +static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) +{ + u32 addr1, addr2, ports; + u32 hash, ihl; + u8 ip_proto = 0; + + if (unlikely(!simple_tx_hashrnd_initialized)) { + get_random_bytes(&simple_tx_hashrnd, 4); + simple_tx_hashrnd_initialized = 1; + } + + switch (skb->protocol) { + case htons(ETH_P_IP): + if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))) + ip_proto = ip_hdr(skb)->protocol; + addr1 = ip_hdr(skb)->saddr; + addr2 = ip_hdr(skb)->daddr; + ihl = ip_hdr(skb)->ihl; + break; + case htons(ETH_P_IPV6): + ip_proto = ipv6_hdr(skb)->nexthdr; + addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3]; + addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3]; + ihl = (40 >> 2); + break; + default: + return 0; + } + + + switch (ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_AH: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + ports = *((u32 *) (skb_network_header(skb) + (ihl * 4))); + break; + + default: + ports = 0; + break; + } + + hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); + + return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); +} + +static struct netdev_queue *dev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + const struct net_device_ops *ops = dev->netdev_ops; + u16 queue_index = 0; + + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb); + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + +/** + * dev_queue_xmit - transmit a buffer + * @skb: buffer to transmit + * + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. + * + * A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping. + * + * ----------------------------------------------------------------------------------- + * I notice this method can also return errors from the queue disciplines, + * including NET_XMIT_DROP, which is a positive value. So, errors can also + * be positive. + * + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. 
(You can bump the ref count + * before sending to hold a reference for retry if you are careful.) + * + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. + * --BLG + */ +int dev_queue_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + int rc = -ENOMEM; + + /* GSO will handle the following emulations directly. */ + if (netif_needs_gso(dev, skb)) + goto gso; + + if (skb_shinfo(skb)->frag_list && + !(dev->features & NETIF_F_FRAGLIST) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* Fragmented skb is linearized if device does not support SG, + * or if at least one of fragments is in highmem and device + * does not support DMA from it. + */ + if (skb_shinfo(skb)->nr_frags && + (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not support + * checksumming for this protocol, complete checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_set_transport_header(skb, skb->csum_start - + skb_headroom(skb)); + if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb)) + goto out_kfree_skb; + } + +gso: + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. + */ + rcu_read_lock_bh(); + + txq = dev_pick_tx(dev, skb); + +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); +#endif + + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... + + Really, it is unlikely that netif_tx_lock protection is necessary + here. (f.e. loopback and IP tunnels are clean ignoring statistics + counters.) + However, it is possible, that they rely on protection + made by us here. + + Check this and shot the lock. It is not prone from deadlocks. + Either shot noqueue qdisc, it is even simpler 8) + */ + if (dev->flags & IFF_UP) { + int cpu = smp_processor_id(); /* ok because BHs are off */ + + if (txq->xmit_lock_owner != cpu) { + + HARD_TX_LOCK(dev, txq, cpu); + + if (!netif_tx_queue_stopped(txq)) { + rc = 0; + if (!dev_hard_start_xmit(skb, dev, txq)) { + HARD_TX_UNLOCK(dev, txq); + goto out; + } + } + HARD_TX_UNLOCK(dev, txq); + if (net_ratelimit()) + printk(KERN_CRIT "Virtual device %s asks to " + "queue packet!\n", dev->name); + } else { + /* Recursion is detected! It is possible, + * unfortunately */ + if (net_ratelimit()) + printk(KERN_CRIT "Dead loop on virtual device " + "%s, fix it urgently!\n", dev->name); + } + } + + rc = -ENETDOWN; + rcu_read_unlock_bh(); + +out_kfree_skb: + kfree_skb(skb); + return rc; +out: + rcu_read_unlock_bh(); + return rc; +} + + +/*======================================================================= + Receiver routines + =======================================================================*/ + +int netdev_max_backlog __read_mostly = 1000; +int netdev_budget __read_mostly = 300; +int weight_p __read_mostly = 64; /* old backlog weight */ + +DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; + + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. 
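+ *
+ * In a DDE_LINUX build the per-CPU backlog queue is bypassed entirely;
+ * the skb is handed straight to the registered l4dde26 receive callback
+ * (see the #else branch below).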
+ * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ +#ifndef DDE_LINUX + struct softnet_data *queue; + unsigned long flags; + + /* if netpoll wants it, pretend we never saw it */ + if (netpoll_rx(skb)) + return NET_RX_DROP; + + if (!skb->tstamp.tv64) + net_timestamp(skb); + + /* + * The code is rearranged so that the path is the most + * short when CPU is congested, but is still operating. + */ + local_irq_save(flags); + queue = &__get_cpu_var(softnet_data); + + __get_cpu_var(netdev_rx_stat).total++; + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { + if (queue->input_pkt_queue.qlen) { +enqueue: + dev_hold(skb->dev); + __skb_queue_tail(&queue->input_pkt_queue, skb); + local_irq_restore(flags); + return NET_RX_SUCCESS; + } + + napi_schedule(&queue->backlog); + goto enqueue; + } + + __get_cpu_var(netdev_rx_stat).dropped++; + local_irq_restore(flags); + + kfree_skb(skb); + return NET_RX_DROP; +#else /* DDE_LINUX */ + /* call our callback fn */ + return l4dde26_do_rx_callback(skb); +#endif +} + +int netif_rx_ni(struct sk_buff *skb) +{ + int err; + + preempt_disable(); + err = netif_rx(skb); + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); + + return err; +} + +EXPORT_SYMBOL(netif_rx_ni); + +static void net_tx_action(struct softirq_action *h) +{ + struct softnet_data *sd = &__get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_disable(); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_enable(); + + while (clist) { + struct sk_buff *skb = clist; + clist = clist->next; + + WARN_ON(atomic_read(&skb->users)); + __kfree_skb(skb); + } + } + + if (sd->output_queue) { + struct Qdisc *head; + + local_irq_disable(); + head = sd->output_queue; + sd->output_queue = NULL; + local_irq_enable(); + + while (head) { + struct Qdisc *q = head; + spinlock_t *root_lock; + + head = head->next_sched; + + root_lock = qdisc_lock(q); + if (spin_trylock(root_lock)) { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + qdisc_run(q); + spin_unlock(root_lock); + } else { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &q->state)) { + __netif_reschedule(q); + } else { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + } + } + } + } +} + +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +/* These hooks defined here for ATM */ +struct net_bridge; +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, + unsigned char *addr); +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; + +/* + * If bridge module is loaded call bridging hook. + * returns NULL if packet was consumed. 
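+ * Any handler still pending in *pt_prev is delivered first; a NULL return
+ * from the hook means the bridge took ownership of the skb and the caller
+ * must stop processing it.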
+ */ +struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, + struct sk_buff *skb) __read_mostly; +static inline struct sk_buff *handle_bridge(struct sk_buff *skb, + struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) +{ + struct net_bridge_port *port; + + if (skb->pkt_type == PACKET_LOOPBACK || + (port = rcu_dereference(skb->dev->br_port)) == NULL) + return skb; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + return br_handle_frame_hook(port, skb); +} +#else +#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb) +#endif + +#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) +struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; +EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); + +static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, + struct packet_type **pt_prev, + int *ret, + struct net_device *orig_dev) +{ + if (skb->dev->macvlan_port == NULL) + return skb; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + return macvlan_handle_frame_hook(skb); +} +#else +#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) +#endif + +#ifdef CONFIG_NET_CLS_ACT +/* TODO: Maybe we should just force sch_ingress to be compiled in + * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions + * a compare and 2 stores extra right now if we dont have it on + * but have CONFIG_NET_CLS_ACT + * NOTE: This doesnt stop any functionality; if you dont have + * the ingress scheduler, you just cant add policies on ingress. + * + */ +static int ing_filter(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + u32 ttl = G_TC_RTTL(skb->tc_verd); + struct netdev_queue *rxq; + int result = TC_ACT_OK; + struct Qdisc *q; + + if (MAX_RED_LOOP < ttl++) { + printk(KERN_WARNING + "Redir loop detected Dropping packet (%d->%d)\n", + skb->iif, dev->ifindex); + return TC_ACT_SHOT; + } + + skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + + rxq = &dev->rx_queue; + + q = rxq->qdisc; + if (q != &noop_qdisc) { + spin_lock(qdisc_lock(q)); + if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) + result = qdisc_enqueue_root(skb, q); + spin_unlock(qdisc_lock(q)); + } + + return result; +} + +static inline struct sk_buff *handle_ing(struct sk_buff *skb, + struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + if (skb->dev->rx_queue.qdisc == &noop_qdisc) + goto out; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } else { + /* Huh? Why does turning on AF_PACKET affect this? */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + } + + switch (ing_filter(skb)) { + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NULL; + } + +out: + skb->tc_verd = 0; + return skb; +} +#endif + +/* + * netif_nit_deliver - deliver received packets to network taps + * @skb: buffer + * + * This function is used to deliver incoming packets to network + * taps. It should be used when the normal netif_receive_skb path + * is bypassed, for example because of VLAN acceleration. 
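+ * Like dev_queue_xmit_nit() on the transmit side, it simply hands the
+ * frame to every ETH_P_ALL handler registered on &ptype_all.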
+ */ +void netif_nit_deliver(struct sk_buff *skb) +{ + struct packet_type *ptype; + + if (list_empty(&ptype_all)) + return; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (!ptype->dev || ptype->dev == skb->dev) + deliver_skb(skb, ptype, skb->dev); + } + rcu_read_unlock(); +} + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ +#ifndef DDE_LINUX + struct packet_type *ptype, *pt_prev; + struct net_device *orig_dev; + struct net_device *null_or_orig; + int ret = NET_RX_DROP; + __be16 type; + + if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) + return NET_RX_SUCCESS; + + /* if we've gotten here through NAPI, check netpoll */ + if (netpoll_receive_skb(skb)) + return NET_RX_DROP; + + if (!skb->tstamp.tv64) + net_timestamp(skb); + + if (!skb->iif) + skb->iif = skb->dev->ifindex; + + null_or_orig = NULL; + orig_dev = skb->dev; + if (orig_dev->master) { + if (skb_bond_should_drop(skb)) + null_or_orig = orig_dev; /* deliver only exact match */ + else + skb->dev = orig_dev->master; + } + + __get_cpu_var(netdev_rx_stat).total++; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + + pt_prev = NULL; + + rcu_read_lock(); + +#ifdef CONFIG_NET_CLS_ACT + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + goto ncls; + } +#endif + + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (ptype->dev == null_or_orig || ptype->dev == skb->dev || + ptype->dev == orig_dev) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + } + +#ifdef CONFIG_NET_CLS_ACT + skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto out; +ncls: +#endif + + skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto out; + skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto out; + + type = skb->protocol; + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { + if (ptype->type == type && + (ptype->dev == null_or_orig || ptype->dev == skb->dev || + ptype->dev == orig_dev)) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + } + + if (pt_prev) { + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + } else { + kfree_skb(skb); + /* Jamal, now you will not able to escape explaining + * me how you were going to use this. 
:-) + */ + ret = NET_RX_DROP; + } + +out: + rcu_read_unlock(); + return ret; +#else /* DDE_LINUX */ + /* call our callback fn */ + return l4dde26_do_rx_callback(skb); +#endif +} + + +/* Network device is going away, flush any packets still pending */ +static void flush_backlog(void *arg) +{ + struct net_device *dev = arg; + struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) + if (skb->dev == dev) { + __skb_unlink(skb, &queue->input_pkt_queue); + kfree_skb(skb); + } +} + +static int napi_gro_complete(struct sk_buff *skb) +{ + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int err = -ENOENT; + + if (NAPI_GRO_CB(skb)->count == 1) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || ptype->dev || !ptype->gro_complete) + continue; + + err = ptype->gro_complete(skb); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return NET_RX_SUCCESS; + } + +out: + skb_shinfo(skb)->gso_size = 0; + __skb_push(skb, -skb_network_offset(skb)); + return netif_receive_skb(skb); +} + +void napi_gro_flush(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + napi_gro_complete(skb); + } + + napi->gro_list = NULL; +} +EXPORT_SYMBOL(napi_gro_flush); + +int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int count = 0; + int same_flow; + int mac_len; + int free; + + if (!(skb->dev->features & NETIF_F_GRO)) + goto normal; + + if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list) + goto normal; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + struct sk_buff *p; + + if (ptype->type != type || ptype->dev || !ptype->gro_receive) + continue; + + skb_reset_network_header(skb); + mac_len = skb->network_header - skb->mac_header; + skb->mac_len = mac_len; + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->free = 0; + + for (p = napi->gro_list; p; p = p->next) { + count++; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + if (p->mac_len != mac_len || + memcmp(skb_mac_header(p), skb_mac_header(skb), + mac_len)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + pp = ptype->gro_receive(&napi->gro_list, skb); + break; + } + rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + same_flow = NAPI_GRO_CB(skb)->same_flow; + free = NAPI_GRO_CB(skb)->free; + + if (pp) { + struct sk_buff *nskb = *pp; + + *pp = nskb->next; + nskb->next = NULL; + napi_gro_complete(nskb); + count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { + __skb_push(skb, -skb_network_offset(skb)); + goto normal; + } + + NAPI_GRO_CB(skb)->count = 1; + skb_shinfo(skb)->gso_size = skb->len; + skb->next = napi->gro_list; + napi->gro_list = skb; + +ok: + return free; + +normal: + return -1; +} +EXPORT_SYMBOL(dev_gro_receive); + +static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + + for (p = napi->gro_list; p; p = p->next) { + NAPI_GRO_CB(p)->same_flow = 1; + NAPI_GRO_CB(p)->flush = 0; + } + + return dev_gro_receive(napi, skb); +} + +int napi_gro_receive(struct 
napi_struct *napi, struct sk_buff *skb) +{ + if (netpoll_receive_skb(skb)) + return NET_RX_DROP; + + switch (__napi_gro_receive(napi, skb)) { + case -1: + return netif_receive_skb(skb); + + case 1: + kfree_skb(skb); + break; + } + + return NET_RX_SUCCESS; +} +EXPORT_SYMBOL(napi_gro_receive); + +void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + __skb_pull(skb, skb_headlen(skb)); + skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + + napi->skb = skb; +} +EXPORT_SYMBOL(napi_reuse_skb); + +struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, + struct napi_gro_fraginfo *info) +{ + struct net_device *dev = napi->dev; + struct sk_buff *skb = napi->skb; + + napi->skb = NULL; + + if (!skb) { + skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); + if (!skb) + goto out; + + skb_reserve(skb, NET_IP_ALIGN); + } + + BUG_ON(info->nr_frags > MAX_SKB_FRAGS); + skb_shinfo(skb)->nr_frags = info->nr_frags; + memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags)); + + skb->data_len = info->len; + skb->len += info->len; + skb->truesize += info->len; + + if (!pskb_may_pull(skb, ETH_HLEN)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + + skb->protocol = eth_type_trans(skb, dev); + + skb->ip_summed = info->ip_summed; + skb->csum = info->csum; + +out: + return skb; +} +EXPORT_SYMBOL(napi_fraginfo_skb); + +int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +{ + struct sk_buff *skb = napi_fraginfo_skb(napi, info); + int err = NET_RX_DROP; + + if (!skb) + goto out; + + if (netpoll_receive_skb(skb)) + goto out; + + err = NET_RX_SUCCESS; + + switch (__napi_gro_receive(napi, skb)) { + case -1: + return netif_receive_skb(skb); + + case 0: + goto out; + } + + napi_reuse_skb(napi, skb); + +out: + return err; +} +EXPORT_SYMBOL(napi_gro_frags); + +static int process_backlog(struct napi_struct *napi, int quota) +{ + int work = 0; + struct softnet_data *queue = &__get_cpu_var(softnet_data); + unsigned long start_time = jiffies; + + napi->weight = weight_p; + do { + struct sk_buff *skb; + + local_irq_disable(); + skb = __skb_dequeue(&queue->input_pkt_queue); + if (!skb) { + local_irq_enable(); + napi_complete(napi); + goto out; + } + local_irq_enable(); + + napi_gro_receive(napi, skb); + } while (++work < quota && jiffies == start_time); + + napi_gro_flush(napi); + +out: + return work; +} + +/** + * __napi_schedule - schedule for receive + * @n: entry to schedule + * + * The entry's receive function will be scheduled to run + */ +void __napi_schedule(struct napi_struct *n) +{ + unsigned long flags; + + local_irq_save(flags); + list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__napi_schedule); + +void __napi_complete(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + BUG_ON(n->gro_list); + + list_del(&n->poll_list); + smp_mb__before_clear_bit(); + clear_bit(NAPI_STATE_SCHED, &n->state); +} +EXPORT_SYMBOL(__napi_complete); + +void napi_complete(struct napi_struct *n) +{ + unsigned long flags; + + /* + * don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu + */ + if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + return; + + napi_gro_flush(n); + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(napi_complete); + +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), 
int weight) +{ + INIT_LIST_HEAD(&napi->poll_list); + napi->gro_list = NULL; + napi->skb = NULL; + napi->poll = poll; + napi->weight = weight; + list_add(&napi->dev_list, &dev->napi_list); + napi->dev = dev; +#ifdef CONFIG_NETPOLL + spin_lock_init(&napi->poll_lock); + napi->poll_owner = -1; +#endif + set_bit(NAPI_STATE_SCHED, &napi->state); +} +EXPORT_SYMBOL(netif_napi_add); + +void netif_napi_del(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + list_del_init(&napi->dev_list); + kfree_skb(napi->skb); + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + kfree_skb(skb); + } + + napi->gro_list = NULL; +} +EXPORT_SYMBOL(netif_napi_del); + + +static void net_rx_action(struct softirq_action *h) +{ + struct list_head *list = &__get_cpu_var(softnet_data).poll_list; + unsigned long time_limit = jiffies + 2; + int budget = netdev_budget; + void *have; + + local_irq_disable(); + + while (!list_empty(list)) { + struct napi_struct *n; + int work, weight; + + /* If softirq window is exhuasted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) + goto softnet_break; + + local_irq_enable(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. + */ + n = list_entry(list->next, struct napi_struct, poll_list); + + have = netpoll_poll_lock(n); + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidently calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) + work = n->poll(n, weight); + + WARN_ON_ONCE(work > weight); + + budget -= work; + + local_irq_disable(); + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(work == weight)) { + if (unlikely(napi_disable_pending(n))) + __napi_complete(n); + else + list_move_tail(&n->poll_list, list); + } + + netpoll_poll_unlock(have); + } +out: + local_irq_enable(); + +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + dma_issue_pending_all(); +#endif + + return; + +softnet_break: + __get_cpu_var(netdev_rx_stat).time_squeeze++; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + goto out; +} + +static gifconf_func_t * gifconf_list [NPROTO]; + +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +{ + if (family >= NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} + + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. 
Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct net *net, struct ifreq __user *arg) +{ + struct net_device *dev; + struct ifreq ifr; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + read_lock(&dev_base_lock); + dev = __dev_get_by_index(net, ifr.ifr_ifindex); + if (!dev) { + read_unlock(&dev_base_lock); + return -ENODEV; + } + + strcpy(ifr.ifr_name, dev->name); + read_unlock(&dev_base_lock); + + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(struct net *net, char __user *arg) +{ + struct ifconf ifc; + struct net_device *dev; + char __user *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for_each_netdev(net, dev) { + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (!pos) + done = gifconf_list[i](dev, NULL, 0); + else + done = gifconf_list[i](dev, pos + total, + len - total); + if (done < 0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +#ifdef CONFIG_PROC_FS +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. + */ +void *dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) +{ + struct net *net = seq_file_net(seq); + loff_t off; + struct net_device *dev; + + read_lock(&dev_base_lock); + if (!*pos) + return SEQ_START_TOKEN; + + off = 1; + for_each_netdev(net, dev) + if (off++ == *pos) + return dev; + + return NULL; +} + +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + ++*pos; + return v == SEQ_START_TOKEN ? + first_net_device(net) : next_net_device((struct net_device *)v); +} + +void dev_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) +{ + read_unlock(&dev_base_lock); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + const struct net_device_stats *stats = dev_get_stats(dev); + + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); +} + +/* + * Called from the PROCfs module. 
This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static struct netif_rx_stats *softnet_get_online(loff_t *pos) +{ + struct netif_rx_stats *rc = NULL; + + while (*pos < nr_cpu_ids) + if (cpu_online(*pos)) { + rc = &per_cpu(netdev_rx_stat, *pos); + break; + } else + ++*pos; + return rc; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct netif_rx_stats *s = v; + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + s->total, s->dropped, s->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + s->cpu_collision ); + return 0; +} + +static const struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_seq_fops = { + .owner = THIS_MODULE, + .open = dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, + .show = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &softnet_seq_ops); +} + +static const struct file_operations softnet_seq_fops = { + .owner = THIS_MODULE, + .open = softnet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *ptype_get_idx(loff_t pos) +{ + struct packet_type *pt = NULL; + loff_t i = 0; + int t; + + list_for_each_entry_rcu(pt, &ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + for (t = 0; t < PTYPE_HASH_SIZE; t++) { + list_for_each_entry_rcu(pt, &ptype_base[t], list) { + if (i == pos) + return pt; + ++i; + } + } + return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct packet_type *pt; + struct list_head *nxt; + int hash; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ptype_get_idx(0); + + pt = v; + nxt = pt->list.next; + if (pt->type == htons(ETH_P_ALL)) { + if (nxt != &ptype_all) + goto found; + hash = 0; + nxt = ptype_base[0].next; + } else + hash = ntohs(pt->type) & PTYPE_HASH_MASK; + + while (nxt == &ptype_base[hash]) { + if (++hash >= PTYPE_HASH_SIZE) + return NULL; + nxt = ptype_base[hash].next; + } +found: + return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ + struct packet_type *pt = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); + else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else + seq_printf(seq, "%04x", ntohs(pt->type)); + + seq_printf(seq, " %-8s %pF\n", + pt->dev ? pt->dev->name : "", pt->func); + } + + return 0; +} + +static const struct seq_operations ptype_seq_ops = { + .start = ptype_seq_start, + .next = ptype_seq_next, + .stop = ptype_seq_stop, + .show = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ptype_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ptype_seq_fops = { + .owner = THIS_MODULE, + .open = ptype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + + +static int __net_init dev_proc_net_init(struct net *net) +{ + int rc = -ENOMEM; + + if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) + goto out; + if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) + goto out_dev; + if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) + goto out_softnet; + + if (wext_proc_init(net)) + goto out_ptype; + rc = 0; +out: + return rc; +out_ptype: + proc_net_remove(net, "ptype"); +out_softnet: + proc_net_remove(net, "softnet_stat"); +out_dev: + proc_net_remove(net, "dev"); + goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ + wext_proc_exit(net); + + proc_net_remove(net, "ptype"); + proc_net_remove(net, "softnet_stat"); + proc_net_remove(net, "dev"); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { + .init = dev_proc_net_init, + .exit = dev_proc_net_exit, +}; + +static int __init dev_proc_init(void) +{ + return register_pernet_subsys(&dev_proc_ops); +} +#else +#define dev_proc_init() 0 +#endif /* CONFIG_PROC_FS */ + + +/** + * netdev_set_master - set up master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success the reference counts + * are adjusted, %RTM_NEWLINK is sent to the routing socket and the + * function returns zero. 
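+ *
+ * Typical use (illustrative sketch; bond_dev and slave are placeholder
+ * names):
+ *	netdev_set_master(slave, bond_dev);	to enslave
+ *	netdev_set_master(slave, NULL);		to break the bond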
+ */ +int netdev_set_master(struct net_device *slave, struct net_device *master) +{ + struct net_device *old = slave->master; + + ASSERT_RTNL(); + + if (master) { + if (old) + return -EBUSY; + dev_hold(master); + } + + slave->master = master; + + synchronize_net(); + + if (old) + dev_put(old); + + if (master) + slave->flags |= IFF_SLAVE; + else + slave->flags &= ~IFF_SLAVE; + + rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); + return 0; +} + +static void dev_change_rx_flags(struct net_device *dev, int flags) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) + ops->ndo_change_rx_flags(dev, flags); +} + +static int __dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + uid_t uid; + gid_t gid; + + ASSERT_RTNL(); + + dev->flags |= IFF_PROMISC; + dev->promiscuity += inc; + if (dev->promiscuity == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch promisc and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_PROMISC; + else { + dev->promiscuity -= inc; + printk(KERN_WARNING "%s: promiscuity touches roof, " + "set promiscuity failed, promiscuity feature " + "of device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } + if (dev->flags != old_flags) { + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags & IFF_PROMISC) ? "entered" : + "left"); + if (audit_enabled) { + current_uid_gid(&uid, &gid); + audit_log(current->audit_context, GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + audit_get_loginuid(current), + uid, gid, + audit_get_sessionid(current)); + } + + dev_change_rx_flags(dev, IFF_PROMISC); + } + return 0; +} + +/** + * dev_set_promiscuity - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + int err; + + err = __dev_set_promiscuity(dev, inc); + if (err < 0) + return err; + if (dev->flags != old_flags) + dev_set_rx_mode(dev); + return err; +} + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + ASSERT_RTNL(); + + dev->flags |= IFF_ALLMULTI; + dev->allmulti += inc; + if (dev->allmulti == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch allmulti and return error. 
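+	 * A count of zero after the update either means a legitimate final
+	 * decrement (just clear IFF_ALLMULTI) or that a positive inc wrapped
+	 * the counter, in which case the change is backed out and
+	 * -EOVERFLOW is returned.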
+ */ + if (inc < 0) + dev->flags &= ~IFF_ALLMULTI; + else { + dev->allmulti -= inc; + printk(KERN_WARNING "%s: allmulti touches roof, " + "set allmulti failed, allmulti feature of " + "device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } + if (dev->flags ^ old_flags) { + dev_change_rx_flags(dev, IFF_ALLMULTI); + dev_set_rx_mode(dev); + } + return 0; +} + +/* + * Upload unicast and multicast address lists to device and + * configure RX filtering. When the device doesn't support unicast + * filtering it is put in promiscuous mode while unicast addresses + * are present. + */ +void __dev_set_rx_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* dev_open will call this function so the list will stay sane. */ + if (!(dev->flags&IFF_UP)) + return; + + if (!netif_device_present(dev)) + return; + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); + else { + /* Unicast addresses changes may only happen under the rtnl, + * therefore calling __dev_set_promiscuity here is safe. + */ + if (dev->uc_count > 0 && !dev->uc_promisc) { + __dev_set_promiscuity(dev, 1); + dev->uc_promisc = 1; + } else if (dev->uc_count == 0 && dev->uc_promisc) { + __dev_set_promiscuity(dev, -1); + dev->uc_promisc = 0; + } + + if (ops->ndo_set_multicast_list) + ops->ndo_set_multicast_list(dev); + } +} + +void dev_set_rx_mode(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); +} + +int __dev_addr_delete(struct dev_addr_list **list, int *count, + void *addr, int alen, int glbl) +{ + struct dev_addr_list *da; + + for (; (da = *list) != NULL; list = &da->next) { + if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && + alen == da->da_addrlen) { + if (glbl) { + int old_glbl = da->da_gusers; + da->da_gusers = 0; + if (old_glbl == 0) + break; + } + if (--da->da_users) + return 0; + + *list = da->next; + kfree(da); + (*count)--; + return 0; + } + } + return -ENOENT; +} + +int __dev_addr_add(struct dev_addr_list **list, int *count, + void *addr, int alen, int glbl) +{ + struct dev_addr_list *da; + + for (da = *list; da != NULL; da = da->next) { + if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && + da->da_addrlen == alen) { + if (glbl) { + int old_glbl = da->da_gusers; + da->da_gusers = 1; + if (old_glbl) + return 0; + } + da->da_users++; + return 0; + } + } + + da = kzalloc(sizeof(*da), GFP_ATOMIC); + if (da == NULL) + return -ENOMEM; + memcpy(da->da_addr, addr, alen); + da->da_addrlen = alen; + da->da_users = 1; + da->da_gusers = glbl ? 1 : 0; + da->next = *list; + *list = da; + (*count)++; + return 0; +} + +/** + * dev_unicast_delete - Release secondary unicast address. + * @dev: device + * @addr: address to delete + * @alen: length of @addr + * + * Release reference to a secondary unicast address and remove it + * from the device if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_unicast_delete(struct net_device *dev, void *addr, int alen) +{ + int err; + + ASSERT_RTNL(); + + netif_addr_lock_bh(dev); + err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_unicast_delete); + +/** + * dev_unicast_add - add a secondary unicast address + * @dev: device + * @addr: address to add + * @alen: length of @addr + * + * Add a secondary unicast address to the device or increase + * the reference count if it already exists. 
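+ *
+ * Illustrative call for an Ethernet address (not from this file):
+ *	err = dev_unicast_add(dev, mac, ETH_ALEN);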
+ * + * The caller must hold the rtnl_mutex. + */ +int dev_unicast_add(struct net_device *dev, void *addr, int alen) +{ + int err; + + ASSERT_RTNL(); + + netif_addr_lock_bh(dev); + err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_unicast_add); + +int __dev_addr_sync(struct dev_addr_list **to, int *to_count, + struct dev_addr_list **from, int *from_count) +{ + struct dev_addr_list *da, *next; + int err = 0; + + da = *from; + while (da != NULL) { + next = da->next; + if (!da->da_synced) { + err = __dev_addr_add(to, to_count, + da->da_addr, da->da_addrlen, 0); + if (err < 0) + break; + da->da_synced = 1; + da->da_users++; + } else if (da->da_users == 1) { + __dev_addr_delete(to, to_count, + da->da_addr, da->da_addrlen, 0); + __dev_addr_delete(from, from_count, + da->da_addr, da->da_addrlen, 0); + } + da = next; + } + return err; +} + +void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, + struct dev_addr_list **from, int *from_count) +{ + struct dev_addr_list *da, *next; + + da = *from; + while (da != NULL) { + next = da->next; + if (da->da_synced) { + __dev_addr_delete(to, to_count, + da->da_addr, da->da_addrlen, 0); + da->da_synced = 0; + __dev_addr_delete(from, from_count, + da->da_addr, da->da_addrlen, 0); + } + da = next; + } +} + +/** + * dev_unicast_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. + */ +int dev_unicast_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + netif_addr_lock_bh(to); + err = __dev_addr_sync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock_bh(to); + return err; +} +EXPORT_SYMBOL(dev_unicast_sync); + +/** + * dev_unicast_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_unicast_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_unicast_unsync(struct net_device *to, struct net_device *from) +{ + netif_addr_lock_bh(from); + netif_addr_lock(to); + + __dev_addr_unsync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count); + __dev_set_rx_mode(to); + + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_unicast_unsync); + +static void __dev_addr_discard(struct dev_addr_list **list) +{ + struct dev_addr_list *tmp; + + while (*list != NULL) { + tmp = *list; + *list = tmp->next; + if (tmp->da_users > tmp->da_gusers) + printk("__dev_addr_discard: address leakage! " + "da_users=%d\n", tmp->da_users); + kfree(tmp); + } +} + +static void dev_addr_discard(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + + __dev_addr_discard(&dev->uc_list); + dev->uc_count = 0; + + __dev_addr_discard(&dev->mc_list); + dev->mc_count = 0; + + netif_addr_unlock_bh(dev); +} + +/** + * dev_get_flags - get flags reported to userspace + * @dev: device + * + * Get the combination of flag bits exported through APIs to userspace. 
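+ * IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are masked out of dev->flags
+ * here and recomputed from the carrier and operational state, while
+ * IFF_PROMISC and IFF_ALLMULTI are taken from the user-requested gflags.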
+ */ +unsigned dev_get_flags(const struct net_device *dev) +{ + unsigned flags; + + flags = (dev->flags & ~(IFF_PROMISC | + IFF_ALLMULTI | + IFF_RUNNING | + IFF_LOWER_UP | + IFF_DORMANT)) | + (dev->gflags & (IFF_PROMISC | + IFF_ALLMULTI)); + + if (netif_running(dev)) { + if (netif_oper_up(dev)) + flags |= IFF_RUNNING; + if (netif_carrier_ok(dev)) + flags |= IFF_LOWER_UP; + if (netif_dormant(dev)) + flags |= IFF_DORMANT; + } + + return flags; +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned flags) +{ + int ret, changes; + int old_flags = dev->flags; + + ASSERT_RTNL(); + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | + IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | + IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + if ((old_flags ^ flags) & IFF_MULTICAST) + dev_change_rx_flags(dev, IFF_MULTICAST); + + dev_set_rx_mode(dev); + + /* + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + + if (!ret) + dev_set_rx_mode(dev); + } + + if (dev->flags & IFF_UP && + ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | + IFF_VOLATILE))) + call_netdevice_notifiers(NETDEV_CHANGE, dev); + + if ((flags ^ dev->gflags) & IFF_PROMISC) { + int inc = (flags & IFF_PROMISC) ? +1 : -1; + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + is important. Some (broken) drivers set IFF_PROMISC, when + IFF_ALLMULTI is requested not asking us and not reporting. + */ + if ((flags ^ dev->gflags) & IFF_ALLMULTI) { + int inc = (flags & IFF_ALLMULTI) ? +1 : -1; + dev->gflags ^= IFF_ALLMULTI; + dev_set_allmulti(dev, inc); + } + + /* Exclude state transition flags, already notified */ + changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); + if (changes) + rtmsg_ifinfo(RTM_NEWLINK, dev, changes); + + return ret; +} + +/** + * dev_set_mtu - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * + * Change the maximum transfer size of the network device. + */ +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (new_mtu == dev->mtu) + return 0; + + /* MTU must be positive. 
*/ + if (new_mtu < 0) + return -EINVAL; + + if (!netif_device_present(dev)) + return -ENODEV; + + err = 0; + if (ops->ndo_change_mtu) + err = ops->ndo_change_mtu(dev, new_mtu); + else + dev->mtu = new_mtu; + + if (!err && dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + return err; +} + +/** + * dev_set_mac_address - Change Media Access Control Address + * @dev: device + * @sa: new address + * + * Change the hardware (MAC) address of the device + */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (!ops->ndo_set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = ops->ndo_set_mac_address(dev, sa); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} + +/* + * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = dev_get_flags(dev); + return 0; + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; + + case SIOCGIFSLAVE: + err = -EINVAL; + break; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -EINVAL; + break; + + } + return err; +} + +/* + * Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + const struct net_device_ops *ops; + + if (!dev) + return -ENODEV; + + ops = dev->netdev_ops; + + switch (cmd) { + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCSIFMAP: + if (ops->ndo_set_config) { + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; + + case 
SIOCADDMULTI: + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, + dev->addr_len, 1); + + case SIOCDELMULTI: + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, + dev->addr_len, 1); + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + /* + * Unknown or private ioctl + */ + + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; + + } + return err; +} + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @net: the applicable net namespace + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_lock(); + ret = dev_ifconf(net, (char __user *) arg); + rtnl_unlock(); + return ret; + } + if (cmd == SIOCGIFNAME) + return dev_ifname(net, (struct ifreq __user *)arg); + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(net, ifr.ifr_name); + read_lock(&dev_base_lock); + ret = dev_ifsioc_locked(net, &ifr, cmd); + read_unlock(&dev_base_lock); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + case SIOCETHTOOL: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(net, &ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. 
+ * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -EINVAL; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) + return wext_handle_ioctl(net, &ifr, cmd, arg); + return -EINVAL; + } +} + + +/** + * dev_new_index - allocate an ifindex + * @net: the applicable net namespace + * + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. + */ +static int dev_new_index(struct net *net) +{ + static int ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(net, ifindex)) + return ifindex; + } +} + +/* Delayed registration/unregisteration */ +static LIST_HEAD(net_todo_list); + +static void net_set_todo(struct net_device *dev) +{ + list_add_tail(&dev->todo_list, &net_todo_list); +} + +static void rollback_registered(struct net_device *dev) +{ + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + /* Some devices call without registering for initialization unwind. */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); + + WARN_ON(1); + return; + } + + BUG_ON(dev->reg_state != NETREG_REGISTERED); + + /* If device is running, close it first. */ + dev_close(dev); + + /* And unlink it from device chain. */ + unlist_netdevice(dev); + + dev->reg_state = NETREG_UNREGISTERING; + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. 
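+	   Any protocol that still holds a reference is expected to
+	   drop it from its NETDEV_UNREGISTER handler; see
+	   netdev_wait_allrefs(), which keeps rebroadcasting this
+	   event while references remain.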
+ */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + /* + * Flush the unicast and multicast chains + */ + dev_addr_discard(dev); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + /* Notifier chain MUST detach us from master device. */ + WARN_ON(dev->master); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); + + synchronize_net(); + + dev_put(dev); +} + +static void __netdev_init_queue_locks_one(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) +{ + spin_lock_init(&dev_queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); + dev_queue->xmit_lock_owner = -1; +} + +static void netdev_init_queue_locks(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); + __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); +} + +unsigned long netdev_fix_features(unsigned long features, const char *name) +{ + /* Fix illegal SG+CSUM combinations. */ + if ((features & NETIF_F_SG) && + !(features & NETIF_F_ALL_CSUM)) { + if (name) + printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " + "checksum feature.\n", name); + features &= ~NETIF_F_SG; + } + + /* TSO requires that SG is present as well. */ + if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { + if (name) + printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " + "SG feature.\n", name); + features &= ~NETIF_F_TSO; + } + + if (features & NETIF_F_UFO) { + if (!(features & NETIF_F_GEN_CSUM)) { + if (name) + printk(KERN_ERR "%s: Dropping NETIF_F_UFO " + "since no NETIF_F_HW_CSUM feature.\n", + name); + features &= ~NETIF_F_UFO; + } + + if (!(features & NETIF_F_SG)) { + if (name) + printk(KERN_ERR "%s: Dropping NETIF_F_UFO " + "since no NETIF_F_SG feature.\n", name); + features &= ~NETIF_F_UFO; + } + } + + return features; +} +EXPORT_SYMBOL(netdev_fix_features); + +/* Some devices need to (re-)set their netdev_ops inside + * ->init() or similar. If that happens, we have to setup + * the compat pointers again. + */ +void netdev_resync_ops(struct net_device *dev) +{ +#ifdef CONFIG_COMPAT_NET_DEV_OPS + const struct net_device_ops *ops = dev->netdev_ops; + + dev->init = ops->ndo_init; + dev->uninit = ops->ndo_uninit; + dev->open = ops->ndo_open; + dev->change_rx_flags = ops->ndo_change_rx_flags; + dev->set_rx_mode = ops->ndo_set_rx_mode; + dev->set_multicast_list = ops->ndo_set_multicast_list; + dev->set_mac_address = ops->ndo_set_mac_address; + dev->validate_addr = ops->ndo_validate_addr; + dev->do_ioctl = ops->ndo_do_ioctl; + dev->set_config = ops->ndo_set_config; + dev->change_mtu = ops->ndo_change_mtu; + dev->neigh_setup = ops->ndo_neigh_setup; + dev->tx_timeout = ops->ndo_tx_timeout; + dev->get_stats = ops->ndo_get_stats; + dev->vlan_rx_register = ops->ndo_vlan_rx_register; + dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; +#ifdef CONFIG_NET_POLL_CONTROLLER + dev->poll_controller = ops->ndo_poll_controller; +#endif +#endif +} +EXPORT_SYMBOL(netdev_resync_ops); + +/** + * register_netdevice - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * Callers must hold the rtnl semaphore. You may want + * register_netdev() instead of this. 
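+ *
+ * A minimal caller that manages the lock itself would, roughly, do:
+ *
+ *	rtnl_lock();
+ *	err = register_netdevice(dev);
+ *	rtnl_unlock();
+ *
+ * which is essentially what register_netdev() does, in addition to
+ * expanding a "%d" name format string via dev_alloc_name().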
+ * + * BUGS: + * The locking appears insufficient to guarantee two parallel registers + * will not get the same name. + */ + +int register_netdevice(struct net_device *dev) +{ + struct hlist_head *head; + struct hlist_node *p; + int ret; + struct net *net = dev_net(dev); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + might_sleep(); + + /* When net_device's are persistent, this will be fatal. */ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + BUG_ON(!net); + + spin_lock_init(&dev->addr_list_lock); + netdev_set_addr_lockdep_class(dev); + netdev_init_queue_locks(dev); + + dev->iflink = -1; + +#ifdef CONFIG_COMPAT_NET_DEV_OPS + /* Netdevice_ops API compatiability support. + * This is temporary until all network devices are converted. + */ + if (dev->netdev_ops) { + netdev_resync_ops(dev); + } else { + char drivername[64]; + pr_info("%s (%s): not using net_device_ops yet\n", + dev->name, netdev_drivername(dev, drivername, 64)); + + /* This works only because net_device_ops and the + compatiablity structure are the same. */ + dev->netdev_ops = (void *) &(dev->init); + } +#endif + + /* Init, if this function is available */ + if (dev->netdev_ops->ndo_init) { + ret = dev->netdev_ops->ndo_init(dev); + if (ret) { + if (ret > 0) + ret = -EIO; + goto out; + } + } + + if (!dev_valid_name(dev->name)) { + ret = -EINVAL; + goto err_uninit; + } + + dev->ifindex = dev_new_index(net); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + + /* Check for existence of name */ + head = dev_name_hash(net, dev->name); + hlist_for_each(p, head) { + struct net_device *d + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(d->name, dev->name, IFNAMSIZ)) { + ret = -EEXIST; + goto err_uninit; + } + } + + /* Fix illegal checksum combinations */ + if ((dev->features & NETIF_F_HW_CSUM) && + (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", + dev->name); + dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + } + + if ((dev->features & NETIF_F_NO_CSUM) && + (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", + dev->name); + dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); + } + + dev->features = netdev_fix_features(dev->features, dev->name); + + /* Enable software GSO if SG is supported. */ + if (dev->features & NETIF_F_SG) + dev->features |= NETIF_F_GSO; + + netdev_initialize_kobject(dev); + ret = netdev_register_kobject(dev); + if (ret) + goto err_uninit; + dev->reg_state = NETREG_REGISTERED; + + /* + * Default initial state at registry is that the + * device is present. + */ + + set_bit(__LINK_STATE_PRESENT, &dev->state); + + dev_init_scheduler(dev); + dev_hold(dev); + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + ret = notifier_to_errno(ret); + if (ret) { + rollback_registered(dev); + dev->reg_state = NETREG_UNREGISTERED; + } + +out: + return ret; + +err_uninit: + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + goto out; +} + +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. 
This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* initialize the ref count */ + atomic_set(&dev->refcnt, 1); + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + + +/** + * register_netdev - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * This is a wrapper around register_netdevice that takes the rtnl semaphore + * and expands the device name if you passed a format string to + * alloc_netdev. + */ +int register_netdev(struct net_device *dev) +{ + int err; + + rtnl_lock(); + + /* + * If the name is a format string the caller wants us to do a + * name allocation. + */ + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto out; + } + + err = register_netdevice(dev); +out: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdev); + +/* + * netdev_wait_allrefs - wait until all references are gone. + * + * This is called when unregistering network devices. + * + * Any protocol or device that holds a reference should register + * for netdevice notification, and cleanup and put back the + * reference if they receive an UNREGISTER event. + * We can get stuck here if buggy protocols don't correctly + * call dev_put. + */ +static void netdev_wait_allrefs(struct net_device *dev) +{ + unsigned long rebroadcast_time, warning_time; + + rebroadcast_time = warning_time = jiffies; + while (atomic_read(&dev->refcnt) != 0) { + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { + rtnl_lock(); + + /* Rebroadcast unregister notification */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + } + + __rtnl_unlock(); + + rebroadcast_time = jiffies; + } + + msleep(250); + + if (time_after(jiffies, warning_time + 10 * HZ)) { + printk(KERN_EMERG "unregister_netdevice: " + "waiting for %s to become free. Usage " + "count = %d\n", + dev->name, atomic_read(&dev->refcnt)); + warning_time = jiffies; + } + } +} + +/* The sequence is: + * + * rtnl_lock(); + * ... + * register_netdevice(x1); + * register_netdevice(x2); + * ... + * unregister_netdevice(y1); + * unregister_netdevice(y2); + * ... + * rtnl_unlock(); + * free_netdev(y1); + * free_netdev(y2); + * + * We are invoked by rtnl_unlock(). 
+ * This allows us to deal with problems: + * 1) We can delete sysfs objects which invoke hotplug + * without deadlocking with linkwatch via keventd. + * 2) Since we run with the RTNL semaphore not held, we can sleep + * safely in order to wait for the netdev refcnt to drop to zero. + * + * We must not return until all unregister events added during + * the interval the lock was held have been completed. + */ +void netdev_run_todo(void) +{ + struct list_head list; + + /* Snapshot list, allow later requests */ + list_replace_init(&net_todo_list, &list); + + __rtnl_unlock(); + + while (!list_empty(&list)) { + struct net_device *dev + = list_entry(list.next, struct net_device, todo_list); + list_del(&dev->todo_list); + + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { + printk(KERN_ERR "network todo '%s' but state %d\n", + dev->name, dev->reg_state); + dump_stack(); + continue; + } + + dev->reg_state = NETREG_UNREGISTERED; + + on_each_cpu(flush_backlog, dev, 1); + + netdev_wait_allrefs(dev); + + /* paranoia */ + BUG_ON(atomic_read(&dev->refcnt)); + WARN_ON(dev->ip_ptr); + WARN_ON(dev->ip6_ptr); + WARN_ON(dev->dn_ptr); + + if (dev->destructor) + dev->destructor(dev); + + /* Free network device */ + kobject_put(&dev->dev.kobj); + } +} + +/** + * dev_get_stats - get network device statistics + * @dev: device to get statistics from + * + * Get network statistics from device. The device driver may provide + * its own method by setting dev->netdev_ops->get_stats; otherwise + * the internal statistics structure is used. + */ +const struct net_device_stats *dev_get_stats(struct net_device *dev) + { + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_stats) + return ops->ndo_get_stats(dev); + else + return &dev->stats; +} +EXPORT_SYMBOL(dev_get_stats); + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, + void *_unused) +{ + queue->dev = dev; +} + +static void netdev_init_queues(struct net_device *dev) +{ + netdev_init_one_queue(dev, &dev->rx_queue, NULL); + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); +} + +/** + * alloc_netdev_mq - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @setup: callback to initialize device + * @queue_count: the number of subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subquue structs + * for each queue on the device at the end of the netdevice. 
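+ *
+ * A hypothetical single-queue caller (struct my_priv and my_setup()
+ * are assumed names) would look roughly like:
+ *
+ *	dev = alloc_netdev_mq(sizeof(struct my_priv), "eth%d", my_setup, 1);
+ *	if (!dev)
+ *		return -ENOMEM;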
+ */ +struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), unsigned int queue_count) +{ + struct netdev_queue *tx; + struct net_device *dev; + size_t alloc_size; + void *p; + + BUG_ON(strlen(name) >= sizeof(dev->name)); + + alloc_size = sizeof(struct net_device); + if (sizeof_priv) { + /* ensure 32-byte alignment of private area */ + alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size += sizeof_priv; + } + /* ensure 32-byte alignment of whole construct */ + alloc_size += NETDEV_ALIGN_CONST; + + p = kzalloc(alloc_size, GFP_KERNEL); + if (!p) { + printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); + return NULL; + } + + tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); + if (!tx) { + printk(KERN_ERR "alloc_netdev: Unable to allocate " + "tx qdiscs.\n"); + kfree(p); + return NULL; + } + + dev = (struct net_device *) + (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); + dev->padded = (char *)dev - (char *)p; + dev_net_set(dev, &init_net); + + dev->_tx = tx; + dev->num_tx_queues = queue_count; + dev->real_num_tx_queues = queue_count; + + dev->gso_max_size = GSO_MAX_SIZE; + + netdev_init_queues(dev); + + INIT_LIST_HEAD(&dev->napi_list); + setup(dev); + strcpy(dev->name, name); + return dev; +} +EXPORT_SYMBOL(alloc_netdev_mq); + +/** + * free_netdev - free network device + * @dev: device + * + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. + * If this is the last reference then it will be freed. + */ +void free_netdev(struct net_device *dev) +{ + struct napi_struct *p, *n; + + release_net(dev_net(dev)); + + kfree(dev->_tx); + + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + netif_napi_del(p); + + /* Compatibility with error handling in drivers */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + kfree((char *)dev - dev->padded); + return; + } + + BUG_ON(dev->reg_state != NETREG_UNREGISTERED); + dev->reg_state = NETREG_RELEASED; + + /* will free via device release */ + put_device(&dev->dev); +} + +/** + * synchronize_net - Synchronize with packet receive processing + * + * Wait for packets currently being received to be done. + * Does not block later packets from starting. + */ +void synchronize_net(void) +{ + might_sleep(); +#ifndef DDE_LINUX + synchronize_rcu(); +#endif +} + +/** + * unregister_netdevice - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * + * Callers must hold the rtnl semaphore. You may want + * unregister_netdev() instead of this. + */ + +void unregister_netdevice(struct net_device *dev) +{ + ASSERT_RTNL(); + + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); +} + +/** + * unregister_netdev - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * + * This is just a wrapper for unregister_netdevice that takes + * the rtnl semaphore. In general you want to use this and not + * unregister_netdevice. 
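+ *
+ * Teardown mirrors registration: a driver removal path typically
+ * calls unregister_netdev() followed by free_netdev(), e.g.
+ *
+ *	unregister_netdev(dev);
+ *	free_netdev(dev);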
+ */ +void unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); +} + +EXPORT_SYMBOL(unregister_netdev); + +/** + * dev_change_net_namespace - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + */ + +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + char buf[IFNAMSIZ]; + const char *destname; + int err; + + ASSERT_RTNL(); + + /* Don't allow namespace local devices to be moved. */ + err = -EINVAL; + if (dev->features & NETIF_F_NETNS_LOCAL) + goto out; + +#ifdef CONFIG_SYSFS + /* Don't allow real devices to be moved when sysfs + * is enabled. + */ + err = -EINVAL; + if (dev->dev.parent) + goto out; +#endif + + /* Ensure the device has been registrered */ + err = -EINVAL; + if (dev->reg_state != NETREG_REGISTERED) + goto out; + + /* Get out if there is nothing todo */ + err = 0; + if (net_eq(dev_net(dev), net)) + goto out; + + /* Pick the destination device name, and ensure + * we can use it in the destination network namespace. + */ + err = -EEXIST; + destname = dev->name; + if (__dev_get_by_name(net, destname)) { + /* We get here if we can't use the current device name */ + if (!pat) + goto out; + if (!dev_valid_name(pat)) + goto out; + if (strchr(pat, '%')) { + if (__dev_alloc_name(net, pat, buf) < 0) + goto out; + destname = buf; + } else + destname = pat; + if (__dev_get_by_name(net, destname)) + goto out; + } + + /* + * And now a mini version of register_netdevice unregister_netdevice. + */ + + /* If device is running close it first. */ + dev_close(dev); + + /* And unlink it from device chain */ + err = -ENODEV; + unlist_netdevice(dev); + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + /* + * Flush the unicast and multicast chains + */ + dev_addr_discard(dev); + + netdev_unregister_kobject(dev); + + /* Actually switch the network namespace */ + dev_net_set(dev, net); + + /* Assign the new device name */ + if (destname != dev->name) + strcpy(dev->name, destname); + + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) { + int iflink = (dev->iflink == dev->ifindex); + dev->ifindex = dev_new_index(net); + if (iflink) + dev->iflink = dev->ifindex; + } + + /* Fixup kobjects */ + err = netdev_register_kobject(dev); + WARN_ON(err); + + /* Add the device back in the hashes */ + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. 
*/ + call_netdevice_notifiers(NETDEV_REGISTER, dev); + + synchronize_net(); + err = 0; +out: + return err; +} + +static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +{ + struct sk_buff **list_skb; + struct Qdisc **list_net; + struct sk_buff *skb; + unsigned int cpu, oldcpu = (unsigned long)ocpu; + struct softnet_data *sd, *oldsd; + + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) + return NOTIFY_OK; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Find end of our output_queue. */ + list_net = &sd->output_queue; + while (*list_net) + list_net = &(*list_net)->next_sched; + /* Append output queue from offline CPU. */ + *list_net = oldsd->output_queue; + oldsd->output_queue = NULL; + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + netif_rx(skb); + + return NOTIFY_OK; +} + + +/** + * netdev_increment_features - increment feature set by one + * @all: current feature set + * @one: new feature set + * @mask: mask feature set + * + * Computes a new feature set after adding a device with feature set + * @one to the master device with current feature set @all. Will not + * enable anything that is off in @mask. Returns the new feature set. + */ +unsigned long netdev_increment_features(unsigned long all, unsigned long one, + unsigned long mask) +{ + /* If device needs checksumming, downgrade to it. */ + if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) + all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); + else if (mask & NETIF_F_ALL_CSUM) { + /* If one device supports v4/v6 checksumming, set for all. */ + if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && + !(all & NETIF_F_GEN_CSUM)) { + all &= ~NETIF_F_ALL_CSUM; + all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + } + + /* If one device supports hw checksumming, set for all. */ + if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { + all &= ~NETIF_F_ALL_CSUM; + all |= NETIF_F_HW_CSUM; + } + } + + one |= NETIF_F_ALL_CSUM; + + one |= all & NETIF_F_ONE_FOR_ALL; + all &= one | NETIF_F_LLTX | NETIF_F_GSO; + all |= one & mask & NETIF_F_ONE_FOR_ALL; + + return all; +} +EXPORT_SYMBOL(netdev_increment_features); + +static struct hlist_head *netdev_create_hash(void) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < NETDEV_HASHENTRIES; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + +/* Initialize per network namespace state */ +static int __net_init netdev_init(struct net *net) +{ + INIT_LIST_HEAD(&net->dev_base_head); + + net->dev_name_head = netdev_create_hash(); + if (net->dev_name_head == NULL) + goto err_name; + + net->dev_index_head = netdev_create_hash(); + if (net->dev_index_head == NULL) + goto err_idx; + + return 0; + +err_idx: + kfree(net->dev_name_head); +err_name: + return -ENOMEM; +} + +/** + * netdev_drivername - network driver for the device + * @dev: network device + * @buffer: buffer for resulting name + * @len: size of buffer + * + * Determine network driver for device. 
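+ *
+ * Returns @buffer, holding the parent device driver's name, or an
+ * empty string when the device has no parent or no driver bound.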
+ */ +char *netdev_drivername(const struct net_device *dev, char *buffer, int len) +{ + const struct device_driver *driver; + const struct device *parent; + + if (len <= 0 || !buffer) + return buffer; + buffer[0] = 0; + + parent = dev->dev.parent; + + if (!parent) + return buffer; + + driver = parent->driver; + if (driver && driver->name) + strlcpy(buffer, driver->name, len); + return buffer; +} + +static void __net_exit netdev_exit(struct net *net) +{ + kfree(net->dev_name_head); + kfree(net->dev_index_head); +} + +static struct pernet_operations __net_initdata netdev_net_ops = { + .init = netdev_init, + .exit = netdev_exit, +}; + +static void __net_exit default_device_exit(struct net *net) +{ + struct net_device *dev; + /* + * Push all migratable of the network devices back to the + * initial network namespace + */ + rtnl_lock(); +restart: + for_each_netdev(net, dev) { + int err; + char fb_name[IFNAMSIZ]; + + /* Ignore unmoveable devices (i.e. loopback) */ + if (dev->features & NETIF_F_NETNS_LOCAL) + continue; + + /* Delete virtual devices */ + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { + dev->rtnl_link_ops->dellink(dev); + goto restart; + } + + /* Push remaing network devices to init_net */ + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + err = dev_change_net_namespace(dev, &init_net, fb_name); + if (err) { + printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", + __func__, dev->name, err); + BUG(); + } + goto restart; + } + rtnl_unlock(); +} + +static struct pernet_operations __net_initdata default_device_ops = { + .exit = default_device_exit, +}; + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ + +/* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. + */ +static int __init net_dev_init(void) +{ + int i, rc = -ENOMEM; + + BUG_ON(!dev_boot_phase); + + if (dev_proc_init()) + goto out; + + if (netdev_kobject_init()) + goto out; + + INIT_LIST_HEAD(&ptype_all); + for (i = 0; i < PTYPE_HASH_SIZE; i++) + INIT_LIST_HEAD(&ptype_base[i]); + + if (register_pernet_subsys(&netdev_net_ops)) + goto out; + + /* + * Initialise the packet receive queues. + */ + + for_each_possible_cpu(i) { + struct softnet_data *queue; + + queue = &per_cpu(softnet_data, i); + skb_queue_head_init(&queue->input_pkt_queue); + queue->completion_queue = NULL; + INIT_LIST_HEAD(&queue->poll_list); + + queue->backlog.poll = process_backlog; + queue->backlog.weight = weight_p; + queue->backlog.gro_list = NULL; + } + + dev_boot_phase = 0; + + /* The loopback device is special if any other network devices + * is present in a network namespace the loopback device must + * be present. Since we now dynamically allocate and free the + * loopback device ensure this invariant is maintained by + * keeping the loopback device as the first device on the + * list of network devices. Ensuring the loopback devices + * is the first device that appears and the last network device + * that disappears. 
+ */ +#ifndef DDE_LINUX + if (register_pernet_device(&loopback_net_ops)) + goto out; +#endif + + if (register_pernet_device(&default_device_ops)) + goto out; + + open_softirq(NET_TX_SOFTIRQ, net_tx_action); + open_softirq(NET_RX_SOFTIRQ, net_rx_action); + + hotcpu_notifier(dev_cpu_callback, 0); +#ifndef DDE_LINUX + dst_init(); +#endif + dev_mcast_init(); + rc = 0; +out: + return rc; +} + +subsys_initcall(net_dev_init); + +EXPORT_SYMBOL(__dev_get_by_index); +EXPORT_SYMBOL(__dev_get_by_name); +EXPORT_SYMBOL(__dev_remove_pack); +EXPORT_SYMBOL(dev_valid_name); +EXPORT_SYMBOL(dev_add_pack); +EXPORT_SYMBOL(dev_alloc_name); +EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(dev_get_by_flags); +EXPORT_SYMBOL(dev_get_by_index); +EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(dev_open); +EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(dev_remove_pack); +EXPORT_SYMBOL(dev_set_allmulti); +EXPORT_SYMBOL(dev_set_promiscuity); +EXPORT_SYMBOL(dev_change_flags); +EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(dev_set_mac_address); +EXPORT_SYMBOL(free_netdev); +EXPORT_SYMBOL(netdev_boot_setup_check); +EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_state_change); +EXPORT_SYMBOL(netif_receive_skb); +EXPORT_SYMBOL(netif_rx); +EXPORT_SYMBOL(register_gifconf); +EXPORT_SYMBOL(register_netdevice); +EXPORT_SYMBOL(register_netdevice_notifier); +EXPORT_SYMBOL(skb_checksum_help); +EXPORT_SYMBOL(synchronize_net); +EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(unregister_netdevice_notifier); +EXPORT_SYMBOL(net_enable_timestamp); +EXPORT_SYMBOL(net_disable_timestamp); +EXPORT_SYMBOL(dev_get_flags); + +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_fdb_get_hook); +EXPORT_SYMBOL(br_fdb_put_hook); +#endif + +#ifdef CONFIG_KMOD +EXPORT_SYMBOL(dev_load); +#endif + +EXPORT_PER_CPU_SYMBOL(softnet_data); diff --git a/libdde-linux26/lib/src/net/core/link_watch.c b/libdde-linux26/lib/src/net/core/link_watch.c new file mode 100644 index 00000000..1afdb815 --- /dev/null +++ b/libdde-linux26/lib/src/net/core/link_watch.c @@ -0,0 +1,238 @@ +/* + * Linux network device link state notification + * + * Author: + * Stefan Rompf <sux@loplof.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/rtnetlink.h> +#include <linux/jiffies.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/bitops.h> +#include <asm/types.h> + + +enum lw_bits { + LW_URGENT = 0, +}; + +static unsigned long linkwatch_flags; +static unsigned long linkwatch_nextevent; + +static void linkwatch_event(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); + +static struct net_device *lweventlist; +static DEFINE_SPINLOCK(lweventlist_lock); + +static unsigned char default_operstate(const struct net_device *dev) +{ +#ifndef DDE_LINUX + if (!netif_carrier_ok(dev)) + return (dev->ifindex != dev->iflink ? 
+ IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); + + if (netif_dormant(dev)) + return IF_OPER_DORMANT; +#endif + + return IF_OPER_UP; +} + + +static void rfc2863_policy(struct net_device *dev) +{ +#ifndef DDE_LINUX + unsigned char operstate = default_operstate(dev); + + if (operstate == dev->operstate) + return; + + write_lock_bh(&dev_base_lock); + + switch(dev->link_mode) { + case IF_LINK_MODE_DORMANT: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_DORMANT; + break; + + case IF_LINK_MODE_DEFAULT: + default: + break; + } + + dev->operstate = operstate; + + write_unlock_bh(&dev_base_lock); +#endif +} + + +static bool linkwatch_urgent_event(struct net_device *dev) +{ + return netif_running(dev) && netif_carrier_ok(dev) && + qdisc_tx_changing(dev); +} + + +static void linkwatch_add_event(struct net_device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&lweventlist_lock, flags); + dev->link_watch_next = lweventlist; + lweventlist = dev; + spin_unlock_irqrestore(&lweventlist_lock, flags); +} + + +static void linkwatch_schedule_work(int urgent) +{ + unsigned long delay = linkwatch_nextevent - jiffies; + + if (test_bit(LW_URGENT, &linkwatch_flags)) + return; + + /* Minimise down-time: drop delay for up event. */ + if (urgent) { + if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) + return; + delay = 0; + } + + /* If we wrap around we'll delay it by at most HZ. */ + if (delay > HZ) + delay = 0; + + /* + * This is true if we've scheduled it immeditately or if we don't + * need an immediate execution and it's already pending. + */ + if (schedule_delayed_work(&linkwatch_work, delay) == !delay) + return; + + /* Don't bother if there is nothing urgent. */ + if (!test_bit(LW_URGENT, &linkwatch_flags)) + return; + + /* It's already running which is good enough. */ + if (!cancel_delayed_work(&linkwatch_work)) + return; + + /* Otherwise we reschedule it again for immediate exection. */ + schedule_delayed_work(&linkwatch_work, 0); +} + + +static void __linkwatch_run_queue(int urgent_only) +{ +#ifndef DDE_LINUX + struct net_device *next; + + /* + * Limit the number of linkwatch events to one + * per second so that a runaway driver does not + * cause a storm of messages on the netlink + * socket. This limit does not apply to up events + * while the device qdisc is down. + */ + if (!urgent_only) + linkwatch_nextevent = jiffies + HZ; + /* Limit wrap-around effect on delay. */ + else if (time_after(linkwatch_nextevent, jiffies + HZ)) + linkwatch_nextevent = jiffies; + + clear_bit(LW_URGENT, &linkwatch_flags); + + spin_lock_irq(&lweventlist_lock); + next = lweventlist; + lweventlist = NULL; + spin_unlock_irq(&lweventlist_lock); + + while (next) { + struct net_device *dev = next; + + next = dev->link_watch_next; + + if (urgent_only && !linkwatch_urgent_event(dev)) { + linkwatch_add_event(dev); + continue; + } + + /* + * Make sure the above read is complete since it can be + * rewritten as soon as we clear the bit below. 
+ */ + smp_mb__before_clear_bit(); + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + rfc2863_policy(dev); + if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) + dev_activate(dev); + else + dev_deactivate(dev); + + netdev_state_change(dev); + } + + dev_put(dev); + } + + if (lweventlist) + linkwatch_schedule_work(0); +#endif +} + + +/* Must be called with the rtnl semaphore held */ +void linkwatch_run_queue(void) +{ + __linkwatch_run_queue(0); +} + + +static void linkwatch_event(struct work_struct *dummy) +{ +#ifndef DDE_LINUX + rtnl_lock(); + __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); + rtnl_unlock(); +#endif +} + + +void linkwatch_fire_event(struct net_device *dev) +{ +#ifndef DDE_LINUX + bool urgent = linkwatch_urgent_event(dev); + + if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { + dev_hold(dev); + + linkwatch_add_event(dev); + } else if (!urgent) + return; + + linkwatch_schedule_work(urgent); +#endif +} + +EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/libdde-linux26/lib/src/net/core/net_namespace.c b/libdde-linux26/lib/src/net/core/net_namespace.c new file mode 100644 index 00000000..ab5a0a7f --- /dev/null +++ b/libdde-linux26/lib/src/net/core/net_namespace.c @@ -0,0 +1,511 @@ +#include <linux/workqueue.h> +#include <linux/rtnetlink.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/idr.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +/* + * Our network namespace constructor/destructor lists + */ + +static LIST_HEAD(pernet_list); +static struct list_head *first_device = &pernet_list; +static DEFINE_MUTEX(net_mutex); + +LIST_HEAD(net_namespace_list); +EXPORT_SYMBOL_GPL(net_namespace_list); + +struct net init_net; +EXPORT_SYMBOL(init_net); + +#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ + +/* + * setup_net runs the initializers for the network namespace object. + */ +static __net_init int setup_net(struct net *net) +{ + /* Must be called with net_mutex held */ + struct pernet_operations *ops; + int error = 0; + + atomic_set(&net->count, 1); + +#ifdef NETNS_REFCNT_DEBUG + atomic_set(&net->use_count, 0); +#endif + + list_for_each_entry(ops, &pernet_list, list) { + if (ops->init) { + error = ops->init(net); + if (error < 0) + goto out_undo; + } + } +out: + return error; + +out_undo: + /* Walk through the list backwards calling the exit functions + * for the pernet modules whose init functions did not fail. 
+ */ + list_for_each_entry_continue_reverse(ops, &pernet_list, list) { + if (ops->exit) + ops->exit(net); + } + +#ifndef DDE_LINUX + rcu_barrier(); +#endif + goto out; +} + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = sizeof(struct net_generic) + + INITIAL_NET_GEN_PTRS * sizeof(void *); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = INITIAL_NET_GEN_PTRS; + + return ng; +} + +#ifdef CONFIG_NET_NS +static struct kmem_cache *net_cachep; +static struct workqueue_struct *netns_wq; + +static struct net *net_alloc(void) +{ + struct net *net = NULL; + struct net_generic *ng; + + ng = net_alloc_generic(); + if (!ng) + goto out; + + net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); + if (!net) + goto out_free; + + rcu_assign_pointer(net->gen, ng); +out: + return net; + +out_free: + kfree(ng); + goto out; +} + +static void net_free(struct net *net) +{ +#ifdef NETNS_REFCNT_DEBUG + if (unlikely(atomic_read(&net->use_count) != 0)) { + printk(KERN_EMERG "network namespace not free! Usage: %d\n", + atomic_read(&net->use_count)); + return; + } +#endif + kfree(net->gen); + kmem_cache_free(net_cachep, net); +} + +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + struct net *new_net = NULL; + int err; + + get_net(old_net); + + if (!(flags & CLONE_NEWNET)) + return old_net; + + err = -ENOMEM; + new_net = net_alloc(); + if (!new_net) + goto out_err; + + mutex_lock(&net_mutex); + err = setup_net(new_net); + if (!err) { + rtnl_lock(); + list_add_tail(&new_net->list, &net_namespace_list); + rtnl_unlock(); + } + mutex_unlock(&net_mutex); + + if (err) + goto out_free; +out: + put_net(old_net); + return new_net; + +out_free: + net_free(new_net); +out_err: + new_net = ERR_PTR(err); + goto out; +} + +static void cleanup_net(struct work_struct *work) +{ + struct pernet_operations *ops; + struct net *net; + + net = container_of(work, struct net, work); + + mutex_lock(&net_mutex); + + /* Don't let anyone else find us. */ + rtnl_lock(); + list_del(&net->list); + rtnl_unlock(); + + /* Run all of the network namespace exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->exit) + ops->exit(net); + } + + mutex_unlock(&net_mutex); + + /* Ensure there are no outstanding rcu callbacks using this + * network namespace. 
+ */ + rcu_barrier(); + + /* Finally it is safe to free my network namespace structure */ + net_free(net); +} + +void __put_net(struct net *net) +{ + /* Cleanup the network namespace in process context */ + INIT_WORK(&net->work, cleanup_net); + queue_work(netns_wq, &net->work); +} +EXPORT_SYMBOL_GPL(__put_net); + +#else +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + if (flags & CLONE_NEWNET) + return ERR_PTR(-EINVAL); + return old_net; +} +#endif + +static int __init net_ns_init(void) +{ + struct net_generic *ng; + int err; + + printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); +#ifdef CONFIG_NET_NS + net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), + SMP_CACHE_BYTES, + SLAB_PANIC, NULL); + + /* Create workqueue for cleanup */ + netns_wq = create_singlethread_workqueue("netns"); + if (!netns_wq) + panic("Could not create netns workq"); +#endif + + ng = net_alloc_generic(); + if (!ng) + panic("Could not allocate generic netns"); + + rcu_assign_pointer(init_net.gen, ng); + + mutex_lock(&net_mutex); + err = setup_net(&init_net); + + rtnl_lock(); + list_add_tail(&init_net.list, &net_namespace_list); + rtnl_unlock(); + + mutex_unlock(&net_mutex); + if (err) + panic("Could not setup the initial network namespace"); + + return 0; +} + +pure_initcall(net_ns_init); + +#ifdef CONFIG_NET_NS +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + struct net *net, *undo_net; + int error; + + list_add_tail(&ops->list, list); + if (ops->init) { + for_each_net(net) { + error = ops->init(net); + if (error) + goto out_undo; + } + } + return 0; + +out_undo: + /* If I have an error cleanup all namespaces I initialized */ + list_del(&ops->list); + if (ops->exit) { + for_each_net(undo_net) { + if (undo_net == net) + goto undone; + ops->exit(undo_net); + } + } +undone: + return error; +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + struct net *net; + + list_del(&ops->list); + if (ops->exit) + for_each_net(net) + ops->exit(net); +} + +#else + +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + if (ops->init == NULL) + return 0; + return ops->init(&init_net); +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + if (ops->exit) + ops->exit(&init_net); +} +#endif + +static DEFINE_IDA(net_generic_ids); + +/** + * register_pernet_subsys - register a network namespace subsystem + * @ops: pernet operations structure for the subsystem + * + * Register a subsystem which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. 
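+ *
+ * A minimal, hypothetical subsystem (my_net_init/my_net_exit are
+ * assumed names) registers itself as:
+ *
+ *	static struct pernet_operations my_net_ops = {
+ *		.init = my_net_init,
+ *		.exit = my_net_exit,
+ *	};
+ *
+ *	err = register_pernet_subsys(&my_net_ops);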
+ */ +int register_pernet_subsys(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(first_device, ops); + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_subsys); + +/** + * unregister_pernet_subsys - unregister a network namespace subsystem + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_subsys(struct pernet_operations *module) +{ + mutex_lock(&net_mutex); + unregister_pernet_operations(module); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_subsys); + +int register_pernet_gen_subsys(int *id, struct pernet_operations *ops) +{ + int rv; + + mutex_lock(&net_mutex); +again: + rv = ida_get_new_above(&net_generic_ids, 1, id); + if (rv < 0) { + if (rv == -EAGAIN) { + ida_pre_get(&net_generic_ids, GFP_KERNEL); + goto again; + } + goto out; + } + rv = register_pernet_operations(first_device, ops); + if (rv < 0) + ida_remove(&net_generic_ids, *id); +out: + mutex_unlock(&net_mutex); + return rv; +} +EXPORT_SYMBOL_GPL(register_pernet_gen_subsys); + +void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + unregister_pernet_operations(ops); + ida_remove(&net_generic_ids, id); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys); + +/** + * register_pernet_device - register a network namespace device + * @ops: pernet operations structure for the subsystem + * + * Register a device which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_device(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(&pernet_list, ops); + if (!error && (first_device == &pernet_list)) + first_device = &ops->list; + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_device); + +int register_pernet_gen_device(int *id, struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); +again: + error = ida_get_new_above(&net_generic_ids, 1, id); + if (error) { + if (error == -EAGAIN) { + ida_pre_get(&net_generic_ids, GFP_KERNEL); + goto again; + } + goto out; + } + error = register_pernet_operations(&pernet_list, ops); + if (error) + ida_remove(&net_generic_ids, *id); + else if (first_device == &pernet_list) + first_device = &ops->list; +out: + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_gen_device); + +/** + * unregister_pernet_device - unregister a network namespace netdevice + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. 
In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_device(struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + if (&ops->list == first_device) + first_device = first_device->next; + unregister_pernet_operations(ops); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_device); + +void unregister_pernet_gen_device(int id, struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + if (&ops->list == first_device) + first_device = first_device->next; + unregister_pernet_operations(ops); + ida_remove(&net_generic_ids, id); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_gen_device); + +static void net_generic_release(struct rcu_head *rcu) +{ + struct net_generic *ng; + + ng = container_of(rcu, struct net_generic, rcu); + kfree(ng); +} + +int net_assign_generic(struct net *net, int id, void *data) +{ + struct net_generic *ng, *old_ng; + + BUG_ON(!mutex_is_locked(&net_mutex)); + BUG_ON(id == 0); + + ng = old_ng = net->gen; + if (old_ng->len >= id) + goto assign; + + ng = kzalloc(sizeof(struct net_generic) + + id * sizeof(void *), GFP_KERNEL); + if (ng == NULL) + return -ENOMEM; + + /* + * Some synchronisation notes: + * + * The net_generic explores the net->gen array inside rcu + * read section. Besides once set the net->gen->ptr[x] + * pointer never changes (see rules in netns/generic.h). + * + * That said, we simply duplicate this array and schedule + * the old copy for kfree after a grace period. + */ + + ng->len = id; + memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void *)); + + rcu_assign_pointer(net->gen, ng); + call_rcu(&old_ng->rcu, net_generic_release); +assign: + ng->ptr[id - 1] = data; + return 0; +} +EXPORT_SYMBOL_GPL(net_assign_generic); diff --git a/libdde-linux26/lib/src/net/core/rtnetlink.c b/libdde-linux26/lib/src/net/core/rtnetlink.c new file mode 100644 index 00000000..8408e3da --- /dev/null +++ b/libdde-linux26/lib/src/net/core/rtnetlink.c @@ -0,0 +1,1436 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ */ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/security.h> +#include <linux/mutex.h> +#include <linux/if_addr.h> +#include <linux/nsproxy.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/string.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <net/fib_rules.h> +#include <net/rtnetlink.h> + +struct rtnl_link +{ + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; +}; + +static DEFINE_MUTEX(rtnl_mutex); + +void rtnl_lock(void) +{ + mutex_lock(&rtnl_mutex); +} + +void __rtnl_unlock(void) +{ + mutex_unlock(&rtnl_mutex); +} + +void rtnl_unlock(void) +{ + /* This fellow will unlock it for us. */ + netdev_run_todo(); +} + +int rtnl_trylock(void) +{ + return mutex_trylock(&rtnl_mutex); +} + +int rtnl_is_locked(void) +{ + return mutex_is_locked(&rtnl_mutex); +} + +static struct rtnl_link *rtnl_msg_handlers[NPROTO]; + +static inline int rtm_msgindex(int msgtype) +{ + int msgindex = msgtype - RTM_BASE; + + /* + * msgindex < 0 implies someone tried to register a netlink + * control code. msgindex >= RTM_NR_MSGTYPES may indicate that + * the message type has not been added to linux/rtnetlink.h + */ + BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); + + return msgindex; +} + +static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL || tab[msgindex].doit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].doit : NULL; +} + +static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL || tab[msgindex].dumpit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].dumpit : NULL; +} + +/** + * __rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * Returns 0 on success or a negative error code. 
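As an illustrative sketch (hypothetical example_* handlers; the PF_BRIDGE/RTM_GETLINK pairing is chosen only for demonstration), a protocol family wires its handlers in roughly like this:

#include <linux/rtnetlink.h>
#include <net/rtnetlink.h>

static int example_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
        // Handle one request message addressed to this family.
        return 0;
}

static int example_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        // Fill skb with one batch of an NLM_F_DUMP reply; cb->args[]
        // carries the resume state between successive calls.
        return skb->len;
}

static int __init example_rtnl_init(void)
{
        // rtnl_register() panics on allocation failure, so init code does
        // not check a return value; __rtnl_register() is the
        // non-panicking variant for use elsewhere.
        rtnl_register(PF_BRIDGE, RTM_GETLINK, example_doit, example_dumpit);
        return 0;
}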
+ */ +int __rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit) +{ + struct rtnl_link *tab; + int msgindex; + + BUG_ON(protocol < 0 || protocol >= NPROTO); + msgindex = rtm_msgindex(msgtype); + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL) { + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); + if (tab == NULL) + return -ENOBUFS; + + rtnl_msg_handlers[protocol] = tab; + } + + if (doit) + tab[msgindex].doit = doit; + + if (dumpit) + tab[msgindex].dumpit = dumpit; + + return 0; +} + +EXPORT_SYMBOL_GPL(__rtnl_register); + +/** + * rtnl_register - Register a rtnetlink message type + * + * Identical to __rtnl_register() but panics on failure. This is useful + * as failure of this function is very unlikely, it can only happen due + * to lack of memory when allocating the chain to store all message + * handlers for a protocol. Meant for use in init functions where lack + * of memory implies no sense in continueing. + */ +void rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit) +{ + if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0) + panic("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", + protocol, msgtype); +} + +EXPORT_SYMBOL_GPL(rtnl_register); + +/** + * rtnl_unregister - Unregister a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * + * Returns 0 on success or a negative error code. + */ +int rtnl_unregister(int protocol, int msgtype) +{ + int msgindex; + + BUG_ON(protocol < 0 || protocol >= NPROTO); + msgindex = rtm_msgindex(msgtype); + + if (rtnl_msg_handlers[protocol] == NULL) + return -ENOENT; + + rtnl_msg_handlers[protocol][msgindex].doit = NULL; + rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + + return 0; +} + +EXPORT_SYMBOL_GPL(rtnl_unregister); + +/** + * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol + * @protocol : Protocol family or PF_UNSPEC + * + * Identical to calling rtnl_unregster() for all registered message types + * of a certain protocol family. + */ +void rtnl_unregister_all(int protocol) +{ + BUG_ON(protocol < 0 || protocol >= NPROTO); + + kfree(rtnl_msg_handlers[protocol]); + rtnl_msg_handlers[protocol] = NULL; +} + +EXPORT_SYMBOL_GPL(rtnl_unregister_all); + +static LIST_HEAD(link_ops); + +/** + * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * The caller must hold the rtnl_mutex. This function should be used + * by drivers that create devices during module initialization. It + * must be called before registering the devices. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_link_register(struct rtnl_link_ops *ops) +{ + if (!ops->dellink) + ops->dellink = unregister_netdevice; + + list_add_tail(&ops->list, &link_ops); + return 0; +} + +EXPORT_SYMBOL_GPL(__rtnl_link_register); + +/** + * rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * Returns 0 on success or a negative error code. 
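As an illustrative sketch (hypothetical driver; "example" is not a real link kind), a virtual-device driver registers its rtnl_link_ops so that "ip link add ... type example" can create instances:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <net/rtnetlink.h>

static void example_setup(struct net_device *dev)
{
        ether_setup(dev);               // sane Ethernet defaults
        dev->tx_queue_len = 0;          // purely virtual, no real queue
}

static struct rtnl_link_ops example_link_ops __read_mostly = {
        .kind   = "example",            // matched against IFLA_INFO_KIND
        .setup  = example_setup,        // used by rtnl_create_link()
};

static int __init example_link_init(void)
{
        // Takes the rtnl lock; __rtnl_link_register() is for callers
        // that already hold it.
        return rtnl_link_register(&example_link_ops);
}

static void __exit example_link_exit(void)
{
        // Also deletes any remaining devices of this kind.
        rtnl_link_unregister(&example_link_ops);
}

module_init(example_link_init);
module_exit(example_link_exit);
MODULE_ALIAS_RTNL_LINK("example");

Since no newlink method is supplied, rtnl_newlink() below simply falls back to register_netdevice() for this kind, and dellink defaults to unregister_netdevice as set up in __rtnl_link_register().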
+ */ +int rtnl_link_register(struct rtnl_link_ops *ops) +{ + int err; + + rtnl_lock(); + err = __rtnl_link_register(ops); + rtnl_unlock(); + return err; +} + +EXPORT_SYMBOL_GPL(rtnl_link_register); + +static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) +{ + struct net_device *dev; +restart: + for_each_netdev(net, dev) { + if (dev->rtnl_link_ops == ops) { + ops->dellink(dev); + goto restart; + } + } +} + +void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) +{ + rtnl_lock(); + __rtnl_kill_links(net, ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_kill_links); + +/** + * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + * + * The caller must hold the rtnl_mutex. + */ +void __rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + struct net *net; + + for_each_net(net) { + __rtnl_kill_links(net, ops); + } + list_del(&ops->list); +} + +EXPORT_SYMBOL_GPL(__rtnl_link_unregister); + +/** + * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + */ +void rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + rtnl_lock(); + __rtnl_link_unregister(ops); + rtnl_unlock(); +} + +EXPORT_SYMBOL_GPL(rtnl_link_unregister); + +static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +{ + const struct rtnl_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) + return ops; + } + return NULL; +} + +static size_t rtnl_link_get_size(const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + size_t size; + + if (!ops) + return 0; + + size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nlmsg_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + + if (ops->get_size) + /* IFLA_INFO_DATA + nested data */ + size += nlmsg_total_size(sizeof(struct nlattr)) + + ops->get_size(dev); + + if (ops->get_xstats_size) + size += ops->get_xstats_size(dev); /* IFLA_INFO_XSTATS */ + + return size; +} + +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + struct nlattr *linkinfo, *data; + int err = -EMSGSIZE; + + linkinfo = nla_nest_start(skb, IFLA_LINKINFO); + if (linkinfo == NULL) + goto out; + + if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) + goto err_cancel_link; + if (ops->fill_xstats) { + err = ops->fill_xstats(skb, dev); + if (err < 0) + goto err_cancel_link; + } + if (ops->fill_info) { + data = nla_nest_start(skb, IFLA_INFO_DATA); + if (data == NULL) + goto err_cancel_link; + err = ops->fill_info(skb, dev); + if (err < 0) + goto err_cancel_data; + nla_nest_end(skb, data); + } + + nla_nest_end(skb, linkinfo); + return 0; + +err_cancel_data: + nla_nest_cancel(skb, data); +err_cancel_link: + nla_nest_cancel(skb, linkinfo); +out: + return err; +} + +static const int rtm_min[RTM_NR_FAMILIES] = +{ + [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), + [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), + [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), + [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct 
rtgenmsg)), + [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), +}; + +static const int rta_max[RTM_NR_FAMILIES] = +{ + [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, + [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, + [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX, + [RTM_FAM(RTM_NEWRULE)] = FRA_MAX, + [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX, + [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, + [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, + [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, +}; + +#ifndef DDE_LINUX +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); + memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); +} + +int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) +{ + struct sock *rtnl = net->rtnl; + int err = 0; + + NETLINK_CB(skb).dst_group = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + +int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) +{ + struct sock *rtnl = net->rtnl; + + return nlmsg_unicast(rtnl, skb, pid); +} + +int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) +{ + struct sock *rtnl = net->rtnl; + int report = 0; + + if (nlh) + report = nlmsg_report(nlh); + + return nlmsg_notify(rtnl, skb, pid, group, report, flags); +} + +void rtnl_set_sk_err(struct net *net, u32 group, int error) +{ + struct sock *rtnl = net->rtnl; + + netlink_set_err(rtnl, 0, group, error); +} + +int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) +{ + struct nlattr *mx; + int i, valid = 0; + + mx = nla_nest_start(skb, RTA_METRICS); + if (mx == NULL) + return -ENOBUFS; + + for (i = 0; i < RTAX_MAX; i++) { + if (metrics[i]) { + valid++; + NLA_PUT_U32(skb, i+1, metrics[i]); + } + } + + if (!valid) { + nla_nest_cancel(skb, mx); + return 0; + } + + return nla_nest_end(skb, mx); + +nla_put_failure: + nla_nest_cancel(skb, mx); + return -EMSGSIZE; +} + +int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, + u32 ts, u32 tsage, long expires, u32 error) +{ + struct rta_cacheinfo ci = { + .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), + .rta_used = dst->__use, + .rta_clntref = atomic_read(&(dst->__refcnt)), + .rta_error = error, + .rta_id = id, + .rta_ts = ts, + .rta_tsage = tsage, + }; + + if (expires) + ci.rta_expires = jiffies_to_clock_t(expires); + + return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); +} + +EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); + +static void set_operstate(struct net_device *dev, unsigned char transition) +{ + unsigned char operstate = dev->operstate; + + switch(transition) { + case IF_OPER_UP: + if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_UNKNOWN) && + !netif_dormant(dev)) + operstate = IF_OPER_UP; + break; + + case IF_OPER_DORMANT: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_DORMANT; + break; + } + + if (dev->operstate != operstate) { + write_lock_bh(&dev_base_lock); + dev->operstate = operstate; + write_unlock_bh(&dev_base_lock); + netdev_state_change(dev); + } +} + +static void copy_rtnl_link_stats(struct rtnl_link_stats *a, + const struct net_device_stats *b) +{ + a->rx_packets = b->rx_packets; + a->tx_packets = b->tx_packets; + a->rx_bytes = 
b->rx_bytes; + a->tx_bytes = b->tx_bytes; + a->rx_errors = b->rx_errors; + a->tx_errors = b->tx_errors; + a->rx_dropped = b->rx_dropped; + a->tx_dropped = b->tx_dropped; + + a->multicast = b->multicast; + a->collisions = b->collisions; + + a->rx_length_errors = b->rx_length_errors; + a->rx_over_errors = b->rx_over_errors; + a->rx_crc_errors = b->rx_crc_errors; + a->rx_frame_errors = b->rx_frame_errors; + a->rx_fifo_errors = b->rx_fifo_errors; + a->rx_missed_errors = b->rx_missed_errors; + + a->tx_aborted_errors = b->tx_aborted_errors; + a->tx_carrier_errors = b->tx_carrier_errors; + a->tx_fifo_errors = b->tx_fifo_errors; + a->tx_heartbeat_errors = b->tx_heartbeat_errors; + a->tx_window_errors = b->tx_window_errors; + + a->rx_compressed = b->rx_compressed; + a->tx_compressed = b->tx_compressed; +}; + +static inline size_t if_nlmsg_size(const struct net_device *dev) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + + nla_total_size(sizeof(struct rtnl_link_ifmap)) + + nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + + nla_total_size(4) /* IFLA_TXQLEN */ + + nla_total_size(4) /* IFLA_WEIGHT */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + + nla_total_size(1) /* IFLA_LINKMODE */ + + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ +} + +static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags) +{ + struct netdev_queue *txq; + struct ifinfomsg *ifm; + struct nlmsghdr *nlh; + const struct net_device_stats *stats; + struct nlattr *attr; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifi_family = AF_UNSPEC; + ifm->__ifi_pad = 0; + ifm->ifi_type = dev->type; + ifm->ifi_index = dev->ifindex; + ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_change = change; + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len); + NLA_PUT_U8(skb, IFLA_OPERSTATE, + netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN); + NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); + NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + + if (dev->ifindex != dev->iflink) + NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); + + if (dev->master) + NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); + + txq = netdev_get_tx_queue(dev, 0); + if (txq->qdisc_sleeping) + NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id); + + if (dev->ifalias) + NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias); + + if (1) { + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + NLA_PUT(skb, IFLA_MAP, sizeof(map), &map); + } + + if (dev->addr_len) { + NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (attr == NULL) + goto nla_put_failure; + + stats = dev_get_stats(dev); + copy_rtnl_link_stats(nla_data(attr), stats); + + if (dev->rtnl_link_ops) { + if (rtnl_link_fill(skb, dev) < 0) + goto nla_put_failure; + } + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int idx; + int s_idx = cb->args[0]; + struct net_device *dev; + + idx = 0; + for_each_netdev(net, dev) { + if (idx < s_idx) + goto cont; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0) + break; +cont: + idx++; + } + cb->args[0] = idx; + + return skb->len; +} + +const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, + [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, + [IFLA_MTU] = { .type = NLA_U32 }, + [IFLA_LINK] = { .type = NLA_U32 }, + [IFLA_TXQLEN] = { .type = NLA_U32 }, + [IFLA_WEIGHT] = { .type = NLA_U32 }, + [IFLA_OPERSTATE] = { .type = NLA_U8 }, + [IFLA_LINKMODE] = { .type = NLA_U8 }, + [IFLA_LINKINFO] = { .type = NLA_NESTED }, + [IFLA_NET_NS_PID] = { .type = NLA_U32 }, + [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, +}; + +static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { + [IFLA_INFO_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_DATA] = { .type = NLA_NESTED }, +}; + +static struct net *get_net_ns_by_pid(pid_t pid) +{ + struct task_struct *tsk; + struct net *net; + + /* Lookup the network namespace */ + net = ERR_PTR(-ESRCH); + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) { + struct nsproxy *nsproxy; + nsproxy = task_nsproxy(tsk); + if (nsproxy) + net = get_net(nsproxy->net_ns); + } + rcu_read_unlock(); + return net; +} + +static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) +{ + if (dev) { + if (tb[IFLA_ADDRESS] && + nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) + return -EINVAL; + + if (tb[IFLA_BROADCAST] && + nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) + return -EINVAL; + } + + return 0; +} + +static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, + struct nlattr **tb, char *ifname, int modified) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int send_addr_notify = 0; + int err; + + if (tb[IFLA_NET_NS_PID]) { + struct net *net; + net = 
get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + if (IS_ERR(net)) { + err = PTR_ERR(net); + goto errout; + } + err = dev_change_net_namespace(dev, net, ifname); + put_net(net); + if (err) + goto errout; + modified = 1; + } + + if (tb[IFLA_MAP]) { + struct rtnl_link_ifmap *u_map; + struct ifmap k_map; + + if (!ops->ndo_set_config) { + err = -EOPNOTSUPP; + goto errout; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto errout; + } + + u_map = nla_data(tb[IFLA_MAP]); + k_map.mem_start = (unsigned long) u_map->mem_start; + k_map.mem_end = (unsigned long) u_map->mem_end; + k_map.base_addr = (unsigned short) u_map->base_addr; + k_map.irq = (unsigned char) u_map->irq; + k_map.dma = (unsigned char) u_map->dma; + k_map.port = (unsigned char) u_map->port; + + err = ops->ndo_set_config(dev, &k_map); + if (err < 0) + goto errout; + + modified = 1; + } + + if (tb[IFLA_ADDRESS]) { + struct sockaddr *sa; + int len; + + if (!ops->ndo_set_mac_address) { + err = -EOPNOTSUPP; + goto errout; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto errout; + } + + len = sizeof(sa_family_t) + dev->addr_len; + sa = kmalloc(len, GFP_KERNEL); + if (!sa) { + err = -ENOMEM; + goto errout; + } + sa->sa_family = dev->type; + memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), + dev->addr_len); + err = ops->ndo_set_mac_address(dev, sa); + kfree(sa); + if (err) + goto errout; + send_addr_notify = 1; + modified = 1; + } + + if (tb[IFLA_MTU]) { + err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + if (err < 0) + goto errout; + modified = 1; + } + + /* + * Interface selected by interface index but interface + * name provided implies that a name change has been + * requested. + */ + if (ifm->ifi_index > 0 && ifname[0]) { + err = dev_change_name(dev, ifname); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_IFALIAS]) { + err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_BROADCAST]) { + nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); + send_addr_notify = 1; + } + + if (ifm->ifi_flags || ifm->ifi_change) { + unsigned int flags = ifm->ifi_flags; + + /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ + if (ifm->ifi_change) + flags = (flags & ifm->ifi_change) | + (dev->flags & ~ifm->ifi_change); + err = dev_change_flags(dev, flags); + if (err < 0) + goto errout; + } + + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + + if (tb[IFLA_LINKMODE]) { + write_lock_bh(&dev_base_lock); + dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + write_unlock_bh(&dev_base_lock); + } + + err = 0; + +errout: + if (err < 0 && modified && net_ratelimit()) + printk(KERN_WARNING "A link change request failed with " + "some changes comitted already. 
Interface %s may " + "have been left with an inconsistent configuration, " + "please check.\n", dev->name); + + if (send_addr_notify) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} + +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + int err; + struct nlattr *tb[IFLA_MAX+1]; + char ifname[IFNAMSIZ]; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + goto errout; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + err = -EINVAL; + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = dev_get_by_name(net, ifname); + else + goto errout; + + if (dev == NULL) { + err = -ENODEV; + goto errout; + } + + if ((err = validate_linkmsg(dev, tb)) < 0) + goto errout_dev; + + err = do_setlink(dev, ifm, tb, ifname, 0); +errout_dev: + dev_put(dev); +errout: + return err; +} + +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + const struct rtnl_link_ops *ops; + struct net_device *dev; + struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + ops = dev->rtnl_link_ops; + if (!ops) + return -EOPNOTSUPP; + + ops->dellink(dev); + return 0; +} + +struct net_device *rtnl_create_link(struct net *net, char *ifname, + const struct rtnl_link_ops *ops, struct nlattr *tb[]) +{ + int err; + struct net_device *dev; + + err = -ENOMEM; + dev = alloc_netdev(ops->priv_size, ifname, ops->setup); + if (!dev) + goto err; + + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err_free; + } + + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; + + if (tb[IFLA_MTU]) + dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_ADDRESS]) + memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), + nla_len(tb[IFLA_ADDRESS])); + if (tb[IFLA_BROADCAST]) + memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), + nla_len(tb[IFLA_BROADCAST])); + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + if (tb[IFLA_LINKMODE]) + dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + + return dev; + +err_free: + free_netdev(dev); +err: + return ERR_PTR(err); +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + const struct rtnl_link_ops *ops; + struct net_device *dev; + struct ifinfomsg *ifm; + char kind[MODULE_NAME_LEN]; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + struct nlattr *linkinfo[IFLA_INFO_MAX+1]; + int err; + +#ifdef CONFIG_MODULES +replay: +#endif + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = 
__dev_get_by_index(net, ifm->ifi_index); + else if (ifname[0]) + dev = __dev_get_by_name(net, ifname); + else + dev = NULL; + + if ((err = validate_linkmsg(dev, tb)) < 0) + return err; + + if (tb[IFLA_LINKINFO]) { + err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, + tb[IFLA_LINKINFO], ifla_info_policy); + if (err < 0) + return err; + } else + memset(linkinfo, 0, sizeof(linkinfo)); + + if (linkinfo[IFLA_INFO_KIND]) { + nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } else { + kind[0] = '\0'; + ops = NULL; + } + + if (1) { + struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + + if (ops) { + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data); + if (err < 0) + return err; + } + } + + if (dev) { + int modified = 0; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data); + if (err < 0) + return err; + modified = 1; + } + + return do_setlink(dev, ifm, tb, ifname, modified); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -ENODEV; + + if (ifm->ifi_index || ifm->ifi_flags || ifm->ifi_change) + return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { +#ifdef CONFIG_MODULES + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; + } +#endif + return -EOPNOTSUPP; + } + + if (!ifname[0]) + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + + dev = rtnl_create_link(net, ifname, ops, tb); + + if (IS_ERR(dev)) + err = PTR_ERR(dev); + else if (ops->newlink) + err = ops->newlink(dev, tb, data); + else + err = register_netdevice(dev); + + if (err < 0 && !IS_ERR(dev)) + free_netdev(dev); + return err; + } +} + +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct sk_buff *nskb; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) { + dev = dev_get_by_index(net, ifm->ifi_index); + if (dev == NULL) + return -ENODEV; + } else + return -EINVAL; + + nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); + if (nskb == NULL) { + err = -ENOBUFS; + goto errout; + } + + err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + goto errout; + } + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); +errout: + dev_put(dev); + + return err; +} + +static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx=1; idx<NPROTO; idx++) { + int type = cb->nlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == PF_PACKET) + continue; + if (rtnl_msg_handlers[idx] == NULL || + rtnl_msg_handlers[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if 
(rtnl_msg_handlers[idx][type].dumpit(skb, cb)) + break; + } + cb->family = idx; + + return skb->len; +} +#endif /* DDE_LINUX */ + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) +{ + struct net *net = dev_net(dev); +#ifndef DDE_LINUX + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); +#endif /* DDE_LINUX */ +} + +#ifndef DDE_LINUX +/* Protected by RTNL sempahore. */ +static struct rtattr **rta_buf; +static int rtattr_max; + +/* Process one rtnetlink message. */ + +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + rtnl_doit_func doit; + int sz_idx, kind; + int min_len; + int family; + int type; + int err; + + type = nlh->nlmsg_type; + if (type > RTM_MAX) + return -EOPNOTSUPP; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + if (family >= NPROTO) + return -EAFNOSUPPORT; + + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + struct sock *rtnl; + rtnl_dumpit_func dumpit; + + dumpit = rtnl_get_dumpit(family, type); + if (dumpit == NULL) + return -EOPNOTSUPP; + + __rtnl_unlock(); + rtnl = net->rtnl; + err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); + rtnl_lock(); + return err; + } + + memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); + + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) + return -EINVAL; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + return -EINVAL; + rta_buf[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + doit = rtnl_get_doit(family, type); + if (doit == NULL) + return -EOPNOTSUPP; + + return doit(skb, nlh, (void *)&rta_buf[0]); +} + +static void rtnetlink_rcv(struct sk_buff *skb) +{ + rtnl_lock(); + netlink_rcv_skb(skb, &rtnetlink_rcv_msg); + rtnl_unlock(); +} + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_UNREGISTER: + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + break; + case NETDEV_REGISTER: + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + break; + case NETDEV_UP: + case NETDEV_DOWN: + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + break; + case NETDEV_CHANGE: + case NETDEV_GOING_DOWN: + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block rtnetlink_dev_notifier = { + .notifier_call = rtnetlink_event, +}; + + +static int rtnetlink_net_init(struct net *net) +{ + struct sock *sk; + sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, + rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + if (!sk) + return -ENOMEM; + net->rtnl = sk; + return 0; +} + +static void 
rtnetlink_net_exit(struct net *net) +{ + netlink_kernel_release(net->rtnl); + net->rtnl = NULL; +} + +static struct pernet_operations rtnetlink_net_ops = { + .init = rtnetlink_net_init, + .exit = rtnetlink_net_exit, +}; + +void __init rtnetlink_init(void) +{ + int i; + + rtattr_max = 0; + for (i = 0; i < ARRAY_SIZE(rta_max); i++) + if (rta_max[i] > rtattr_max) + rtattr_max = rta_max[i]; + rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); + if (!rta_buf) + panic("rtnetlink_init: cannot allocate rta_buf\n"); + + if (register_pernet_subsys(&rtnetlink_net_ops)) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + + netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); + register_netdevice_notifier(&rtnetlink_dev_notifier); + + rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo); + rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL); + rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all); + rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); +} + +EXPORT_SYMBOL(__rta_fill); +EXPORT_SYMBOL(rtnetlink_put_metrics); +EXPORT_SYMBOL(rtnl_lock); +EXPORT_SYMBOL(rtnl_trylock); +EXPORT_SYMBOL(rtnl_unlock); +EXPORT_SYMBOL(rtnl_is_locked); +EXPORT_SYMBOL(rtnl_unicast); +EXPORT_SYMBOL(rtnl_notify); +EXPORT_SYMBOL(rtnl_set_sk_err); +EXPORT_SYMBOL(rtnl_create_link); +EXPORT_SYMBOL(ifla_policy); +#endif /* !DDE_LINUX */ diff --git a/libdde-linux26/lib/src/net/core/skbuff.c b/libdde-linux26/lib/src/net/core/skbuff.c new file mode 100644 index 00000000..40d64a88 --- /dev/null +++ b/libdde-linux26/lib/src/net/core/skbuff.c @@ -0,0 +1,2956 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> + * Florian La Roche <rzsfl@rz.uni-sb.de> + * + * Fixes: + * Alan Cox : Fixed the worst of the load + * balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * Robert Olsson : Removed skb_head_pool + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#ifdef CONFIG_NET_CLS_ACT +#include <net/pkt_sched.h> +#endif +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/splice.h> +#include <linux/cache.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/scatterlist.h> + +#include <net/protocol.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/checksum.h> +#ifndef DDE_LINUX +#include <net/xfrm.h> +#endif /* DDE_LINUX */ + +#include "local.h" + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include "kmap_skb.h" + +static struct kmem_cache *skbuff_head_cache __read_mostly; +static struct kmem_cache *skbuff_fclone_cache __read_mostly; + +static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + put_page(buf->page); +} + +static void sock_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + get_page(buf->page); +} + +static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + + +/* Pipe buffer operations for a socket. */ +static struct pipe_buf_operations sock_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = sock_pipe_buf_release, + .steal = sock_pipe_buf_steal, + .get = sock_pipe_buf_get, +}; + +/* + * Keep out-of-line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always + * reliable. + */ + +/** + * skb_over_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_put(). Not user callable. + */ +void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, + skb->dev ? skb->dev->name : "<NULL>"); + BUG(); +} + +/** + * skb_under_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_push(). Not user callable. + */ + +void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, + skb->dev ? skb->dev->name : "<NULL>"); + BUG(); +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +/** + * __alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * @fclone: allocate from fclone cache instead of head cache + * and allocate a cloned (child) skb + * @node: numa node to allocate memory on + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. 
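As an illustrative sketch (hypothetical helper, not from this file), the usual allocate/reserve/put sequence looks like:

#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/if_ether.h>

static struct sk_buff *example_build_skb(const void *payload, unsigned int len)
{
        struct sk_buff *skb;

        // Room for an Ethernet header plus the payload; callers in atomic
        // context would pass GFP_ATOMIC instead.
        skb = alloc_skb(ETH_HLEN + len, GFP_KERNEL);
        if (!skb)
                return NULL;

        skb_reserve(skb, ETH_HLEN);                     // keep header bytes as headroom
        memcpy(skb_put(skb, len), payload, len);        // consume tail room for data
        return skb;
}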
+ */ +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + int fclone, int node) +{ + struct kmem_cache *cache; + struct skb_shared_info *shinfo; + struct sk_buff *skb; + u8 *data; + + cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; + + /* Get the HEAD */ + skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); + if (!skb) { + printk("kmem_cache_alloc_node fails\n"); + goto out; + } + + size = SKB_DATA_ALIGN(size); + data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), + gfp_mask, node); + if (!data) { + printk("kmalloc_node_track_caller %d fails\n", + size + sizeof(struct skb_shared_info)); + goto nodata; + } + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->del_data = NULL; + skb->pre_del_func = NULL; + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); + shinfo->nr_frags = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; + + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); + + skb->fclone = SKB_FCLONE_ORIG; + atomic_set(fclone_ref, 1); + + child->fclone = SKB_FCLONE_UNAVAILABLE; + } +out: + return skb; +nodata: + kmem_cache_free(cache, skb); + skb = NULL; + goto out; +} + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; + struct sk_buff *skb; + + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } + return skb; +} + +struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) +{ + int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; + struct page *page; + + page = alloc_pages_node(node, gfp_mask, 0); + return page; +} +EXPORT_SYMBOL(__netdev_alloc_page); + +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, + int size) +{ + skb_fill_page_desc(skb, i, page, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += size; +} +EXPORT_SYMBOL(skb_add_rx_frag); + +/** + * dev_alloc_skb - allocate an skbuff for receiving + * @length: length to allocate + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. 
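As an illustrative sketch (hypothetical receive routine), a driver built on these allocators commonly does:

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static void example_rx(struct net_device *dev, const void *frame, unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);        // GFP_ATOMIC inside
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }
        skb_reserve(skb, NET_IP_ALIGN);         // keep the IP header aligned
        memcpy(skb_put(skb, len), frame, len);  // in real drivers this is DMA'd
        skb->protocol = eth_type_trans(skb, dev);
        netif_rx(skb);                          // hand the frame to the stack
}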
Although this function + * allocates memory it can be called from an interrupt. + */ +struct sk_buff *dev_alloc_skb(unsigned int length) +{ + /* + * There is more code here than it seems: + * __dev_alloc_skb is an inline + */ + return __dev_alloc_skb(length, GFP_ATOMIC); +} +EXPORT_SYMBOL(dev_alloc_skb); + +static void skb_drop_list(struct sk_buff **listp) +{ + struct sk_buff *list = *listp; + + *listp = NULL; + + do { + struct sk_buff *this = list; + list = list->next; + kfree_skb(this); + } while (list); +} + +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(&skb_shinfo(skb)->frag_list); +} + +static void skb_clone_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list; + + for (list = skb_shinfo(skb)->frag_list; list; list = list->next) + skb_get(list); +} + +static void skb_release_data(struct sk_buff *skb) +{ + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + } + + if (skb_shinfo(skb)->frag_list) + skb_drop_fraglist(skb); + + kfree(skb->head); + } +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +static void kfree_skbmem(struct sk_buff *skb) +{ + struct sk_buff *other; + atomic_t *fclone_ref; + + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); + break; + + case SKB_FCLONE_ORIG: + fclone_ref = (atomic_t *) (skb + 2); + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, skb); + break; + + case SKB_FCLONE_CLONE: + fclone_ref = (atomic_t *) (skb + 1); + other = skb - 1; + + /* The clone portion is available for + * fast-cloning again. + */ + skb->fclone = SKB_FCLONE_UNAVAILABLE; + + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, other); + break; + } +} + +static void skb_release_head_state(struct sk_buff *skb) +{ +#ifndef DDE_LINUX + dst_release(skb->dst); +#endif +#ifdef CONFIG_XFRM + secpath_put(skb->sp); +#endif + if (skb->destructor) { + WARN_ON(in_irq()); + skb->destructor(skb); + } +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); + nf_conntrack_put_reasm(skb->nfct_reasm); +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(skb->nf_bridge); +#endif +/* XXX: IS this still necessary? - JHS */ +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; +#endif +#endif +} + +/* Free everything but the sk_buff shell. */ +static void skb_release_all(struct sk_buff *skb) +{ + skb_release_head_state(skb); + skb_release_data(skb); +} + +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ +#ifdef DDE_LINUX + if (skb->del_data && skb->pre_del_func + && skb->pre_del_func(skb, skb->del_data)) + return; +#endif + skb_release_all(skb); + kfree_skbmem(skb); +} + +/** + * kfree_skb - free an sk_buff + * @skb: buffer to free + * + * Drop a reference to the buffer and free it if the usage count has + * hit zero. 
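As an illustrative sketch (hypothetical helper), the reference-counting discipline this implies is:

#include <linux/skbuff.h>

static void example_hold_and_release(struct sk_buff *skb)
{
        struct sk_buff *held = skb_get(skb);    // take an extra reference

        // ... hand 'skb' to another consumer, which will kfree_skb() it ...

        kfree_skb(held);        // drop our reference; freed only when it hits zero
}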
+ */ +void kfree_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; +#ifdef DDE_LINUX + if (atomic_read(&skb->users) == 0) { + __kfree_skb(skb); + return; + } +#endif + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + __kfree_skb(skb); +} + +/** + * skb_recycle_check - check if skb can be reused for receive + * @skb: buffer + * @skb_size: minimum receive buffer size + * + * Checks that the skb passed in is not shared or cloned, and + * that it is linear and its head portion at least as large as + * skb_size so that it can be recycled as a receive buffer. + * If these conditions are met, this function does any necessary + * reference count dropping and cleans up the skbuff as if it + * just came from __alloc_skb(). + */ +int skb_recycle_check(struct sk_buff *skb, int skb_size) +{ + struct skb_shared_info *shinfo; + + if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) + return 0; + + skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); + if (skb_end_pointer(skb) - skb->head < skb_size) + return 0; + + if (skb_shared(skb) || skb_cloned(skb)) + return 0; + + skb_release_head_state(skb); + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); + shinfo->nr_frags = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->data = skb->head + NET_SKB_PAD; + skb_reset_tail_pointer(skb); + + return 1; +} +EXPORT_SYMBOL(skb_recycle_check); + +static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ + new->tstamp = old->tstamp; + new->dev = old->dev; + new->transport_header = old->transport_header; + new->network_header = old->network_header; + new->mac_header = old->mac_header; + new->dst = dst_clone(old->dst); +#ifdef CONFIG_XFRM + new->sp = secpath_get(old->sp); +#endif + memcpy(new->cb, old->cb, sizeof(old->cb)); + new->csum_start = old->csum_start; + new->csum_offset = old->csum_offset; + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; + new->ip_summed = old->ip_summed; + skb_copy_queue_mapping(new, old); + new->priority = old->priority; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + new->ipvs_property = old->ipvs_property; +#endif + new->protocol = old->protocol; + new->mark = old->mark; + __nf_copy(new, old); +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + new->nf_trace = old->nf_trace; +#endif +#ifdef CONFIG_NET_SCHED + new->tc_index = old->tc_index; +#ifdef CONFIG_NET_CLS_ACT + new->tc_verd = old->tc_verd; +#endif +#endif + new->vlan_tci = old->vlan_tci; + + skb_copy_secmark(new, old); +} + +static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) +{ +#define C(x) n->x = skb->x + + n->next = n->prev = NULL; + n->sk = NULL; + __copy_skb_header(n, skb); + + C(len); + C(data_len); + C(mac_len); + n->hdr_len = skb->nohdr ? 
skb_headroom(skb) : skb->hdr_len; + n->cloned = 1; + n->nohdr = 0; + n->destructor = NULL; + C(iif); + C(tail); + C(end); + C(head); + C(data); + C(truesize); +#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) + C(do_not_encrypt); + C(requeue); +#endif + atomic_set(&n->users, 1); + + atomic_inc(&(skb_shinfo(skb)->dataref)); + skb->cloned = 1; + + return n; +#undef C +} + +/** + * skb_morph - morph one skb into another + * @dst: the skb to receive the contents + * @src: the skb to supply the contents + * + * This is identical to skb_clone except that the target skb is + * supplied by the user. + * + * The target skb is returned upon exit. + */ +struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) +{ + skb_release_all(dst); + return __skb_clone(dst, src); +} +EXPORT_SYMBOL_GPL(skb_morph); + +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an &sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns %NULL otherwise the new buffer is returned. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + */ + +struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +{ + struct sk_buff *n; + + n = skb + 1; + if (skb->fclone == SKB_FCLONE_ORIG && + n->fclone == SKB_FCLONE_UNAVAILABLE) { + atomic_t *fclone_ref = (atomic_t *) (n + 1); + n->fclone = SKB_FCLONE_CLONE; + atomic_inc(fclone_ref); + } else { + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + + return __skb_clone(n, skb); +} + +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ +#ifndef NET_SKBUFF_DATA_USES_OFFSET + /* + * Shift between the two data areas in bytes + */ + unsigned long offset = new->data - old->data; +#endif + + __copy_skb_header(new, old); + +#ifndef NET_SKBUFF_DATA_USES_OFFSET + /* {transport,network,mac}_header are relative to skb->head */ + new->transport_header += offset; + new->network_header += offset; + new->mac_header += offset; +#endif + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; +} + +/** + * skb_copy - create private copy of an sk_buff + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data. This is used when the + * caller wishes to modify the data and needs a private copy of the + * data to alter. Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * As by-product this function converts non-linear &sk_buff to linear + * one, so that &sk_buff becomes completely private and caller is allowed + * to modify all the data of returned buffer. This means that this + * function is not recommended for use in circumstances when only + * header is going to be modified. Use pskb_copy() instead. 
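As an illustrative sketch (hypothetical helper), a typical reason to reach for skb_copy() is:

#include <linux/skbuff.h>

static struct sk_buff *example_private_copy(struct sk_buff *skb)
{
        struct sk_buff *copy = skb_copy(skb, GFP_ATOMIC);

        if (!copy)
                return NULL;
        // The whole of copy->data may now be rewritten without affecting
        // clones of the original; if only the headers need editing,
        // pskb_copy() (below) is the cheaper choice.
        return copy;
}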
+ */ + +struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) +{ + int headerlen = skb->data - skb->head; + /* + * Allocate the copy buffer + */ + struct sk_buff *n; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + n = alloc_skb(skb->end + skb->data_len, gfp_mask); +#else + n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); +#endif + if (!n) + return NULL; + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) + BUG(); + + copy_skb_header(n, skb); + return n; +} + + +/** + * pskb_copy - create copy of an sk_buff with private head. + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and part of its data, located + * in header. Fragmented data remain shared. This is used when + * the caller wishes to modify only header of &sk_buff and needs + * private copy of the header to alter. Returns %NULL on failure + * or the pointer to the buffer on success. + * The returned buffer has a reference count of 1. + */ + +struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + n = alloc_skb(skb->end, gfp_mask); +#else + n = alloc_skb(skb->end - skb->head, gfp_mask); +#endif + if (!n) + goto out; + + /* Set the data pointer */ + skb_reserve(n, skb->data - skb->head); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + skb_copy_from_linear_data(skb, n->data, n->len); + + n->truesize += skb->data_len; + n->data_len = skb->data_len; + n->len = skb->len; + + if (skb_shinfo(skb)->nr_frags) { + int i; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; + } + + if (skb_shinfo(skb)->frag_list) { + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; + skb_clone_fraglist(n); + } + + copy_skb_header(n, skb); +out: + return n; +} + +/** + * pskb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @nhead: room to add at head + * @ntail: room to add at tail + * @gfp_mask: allocation priority + * + * Expands (or creates identical copy, if &nhead and &ntail are zero) + * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * reference count of 1. Returns zero in the case of success or error, + * if expansion failed. In the last case, &sk_buff is not changed. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + gfp_t gfp_mask) +{ + int i; + u8 *data; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + int size = nhead + skb->end + ntail; +#else + int size = nhead + (skb->end - skb->head) + ntail; +#endif + long off; + + BUG_ON(nhead < 0); + + if (skb_shared(skb)) + BUG(); + + size = SKB_DATA_ALIGN(size); + + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. 
*/ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + memcpy(data + nhead, skb->head, skb->tail); +#else + memcpy(data + nhead, skb->head, skb->tail - skb->head); +#endif + memcpy(data + size, skb_end_pointer(skb), + sizeof(struct skb_shared_info)); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + + skb_release_data(skb); + + off = (data + nhead) - skb->head; + + skb->head = data; + skb->data += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + off = nhead; +#else + skb->end = skb->head + size; +#endif + /* {transport,network,mac}_header and tail are relative to skb->head */ + skb->tail += off; + skb->transport_header += off; + skb->network_header += off; + skb->mac_header += off; + skb->csum_start += nhead; + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; + +nodata: + return -ENOMEM; +} + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ + struct sk_buff *skb2; + int delta = headroom - skb_headroom(skb); + + if (delta <= 0) + skb2 = pskb_copy(skb, GFP_ATOMIC); + else { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, + GFP_ATOMIC)) { + kfree_skb(skb2); + skb2 = NULL; + } + } + return skb2; +} + + +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass %GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, + gfp_t gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask); + int oldheadroom = skb_headroom(skb); + int head_copy_len, head_copy_off; + int off; + + if (!n) + return NULL; + + skb_reserve(n, newheadroom); + + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + head_copy_len = oldheadroom; + head_copy_off = 0; + if (newheadroom <= head_copy_len) + head_copy_len = newheadroom; + else + head_copy_off = newheadroom - head_copy_len; + + /* Copy the linear header and data. */ + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)) + BUG(); + + copy_skb_header(n, skb); + + off = newheadroom - oldheadroom; + n->csum_start += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + n->transport_header += off; + n->network_header += off; + n->mac_header += off; +#endif + + return n; +} + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error. 
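+ *
+ * An illustrative transmit-path sketch (not part of the original source),
+ * padding short Ethernet frames to the 60-byte minimum before they reach
+ * the hardware; the local variable tx_len is hypothetical:
+ *
+ *	if (skb->len < ETH_ZLEN) {
+ *		if (skb_pad(skb, ETH_ZLEN - skb->len))
+ *			return NETDEV_TX_OK;	(skb_pad already freed skb)
+ *		tx_len = ETH_ZLEN;		(length handed to the DMA ring)
+ *	}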
+ */ + +int skb_pad(struct sk_buff *skb, int pad) +{ + int err; + int ntail; + + /* If the skbuff is non linear tailroom is always zero.. */ + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { + memset(skb->data+skb->len, 0, pad); + return 0; + } + + ntail = skb->data_len + pad - (skb->end - skb->tail); + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) + goto free_skb; + } + + /* FIXME: The use of this function with non-linear skb's really needs + * to be audited. + */ + err = skb_linearize(skb); + if (unlikely(err)) + goto free_skb; + + memset(skb->data + skb->len, 0, pad); + return 0; + +free_skb: + kfree_skb(skb); + return err; +} + +/** + * skb_put - add data to a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer. If this would + * exceed the total buffer size the kernel will panic. A pointer to the + * first byte of the extra data is returned. + */ +unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +{ + unsigned char *tmp = skb_tail_pointer(skb); + SKB_LINEAR_ASSERT(skb); + skb->tail += len; + skb->len += len; + if (unlikely(skb->tail > skb->end)) + skb_over_panic(skb, len, __builtin_return_address(0)); + return tmp; +} +EXPORT_SYMBOL(skb_put); + +/** + * skb_push - add data to the start of a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer at the buffer + * start. If this would exceed the total buffer headroom the kernel will + * panic. A pointer to the first byte of the extra data is returned. + */ +unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +{ + skb->data -= len; + skb->len += len; + if (unlikely(skb->data<skb->head)) + skb_under_panic(skb, len, __builtin_return_address(0)); + return skb->data; +} +EXPORT_SYMBOL(skb_push); + +/** + * skb_pull - remove data from the start of a buffer + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the next data in the buffer + * is returned. Once the data has been pulled future pushes will overwrite + * the old data. + */ +unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) +{ + return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); +} +EXPORT_SYMBOL(skb_pull); + +/** + * skb_trim - remove end from a buffer + * @skb: buffer to alter + * @len: new length + * + * Cut the length of a buffer down by removing data from the tail. If + * the buffer is already under the length specified it is not modified. + * The skb must be linear. + */ +void skb_trim(struct sk_buff *skb, unsigned int len) +{ + if (skb->len > len) + __skb_trim(skb, len); +} +EXPORT_SYMBOL(skb_trim); + +/* Trims skb to length len. It can change skb pointers. 
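+ *
+ * As an illustrative aside (not part of the original source), the helpers
+ * above are typically combined in a receive path roughly as follows;
+ * frame_len, hdr_len and crc_len are hypothetical, device-specific values,
+ * and pskb_trim() is the skbuff.h wrapper that ends up here for
+ * non-linear buffers:
+ *
+ *	skb_put(skb, frame_len);		(bytes the hardware wrote)
+ *	skb_pull(skb, hdr_len);			(skip a device-specific header)
+ *	pskb_trim(skb, skb->len - crc_len);	(drop the trailing CRC)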
+ */ + +int ___pskb_trim(struct sk_buff *skb, unsigned int len) +{ + struct sk_buff **fragp; + struct sk_buff *frag; + int offset = skb_headlen(skb); + int nfrags = skb_shinfo(skb)->nr_frags; + int i; + int err; + + if (skb_cloned(skb) && + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) + return err; + + i = 0; + if (offset >= len) + goto drop_pages; + + for (; i < nfrags; i++) { + int end = offset + skb_shinfo(skb)->frags[i].size; + + if (end < len) { + offset = end; + continue; + } + + skb_shinfo(skb)->frags[i++].size = len - offset; + +drop_pages: + skb_shinfo(skb)->nr_frags = i; + + for (; i < nfrags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + + if (skb_shinfo(skb)->frag_list) + skb_drop_fraglist(skb); + goto done; + } + + for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); + fragp = &frag->next) { + int end = offset + frag->len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if (unlikely(!nfrag)) + return -ENOMEM; + + nfrag->next = frag->next; + kfree_skb(frag); + frag = nfrag; + *fragp = frag; + } + + if (end < len) { + offset = end; + continue; + } + + if (end > len && + unlikely((err = pskb_trim(frag, len - offset)))) + return err; + + if (frag->next) + skb_drop_list(&frag->next); + break; + } + +done: + if (len > skb_headlen(skb)) { + skb->data_len -= skb->len - len; + skb->len = len; + } else { + skb->len = len; + skb->data_len = 0; + skb_set_tail_pointer(skb, len); + } + + return 0; +} + +/** + * __pskb_pull_tail - advance tail of skb header + * @skb: buffer to reallocate + * @delta: number of bytes to advance tail + * + * The function makes a sense only on a fragmented &sk_buff, + * it expands header moving its tail forward and copying necessary + * data from fragmented part. + * + * &sk_buff MUST have reference count of 1. + * + * Returns %NULL (and &sk_buff does not change) if pull failed + * or value of new tail of skb in the case of success. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +/* Moves tail of skb head forward, copying data from fragmented part, + * when it is necessary. + * 1. It may fail due to malloc failure. + * 2. It may change skb pointers. + * + * It is pretty complicated. Luckily, it is called only in exceptional cases. + */ +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +{ + /* If skb has not enough free space at tail, get new one + * plus 128 bytes for future expansions. If we have enough + * room at tail, reallocate without expansion only if skb is cloned. + */ + int i, k, eat = (skb->tail + delta) - skb->end; + + if (eat > 0 || skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, + GFP_ATOMIC)) + return NULL; + } + + if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) + BUG(); + + /* Optimization: no fragments, no reasons to preestimate + * size of pulled pages. Superb. + */ + if (!skb_shinfo(skb)->frag_list) + goto pull_pages; + + /* Estimate size of pulled pages. */ + eat = delta; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size >= eat) + goto pull_pages; + eat -= skb_shinfo(skb)->frags[i].size; + } + + /* If we need update frag list, we are in troubles. + * Certainly, it possible to add an offset to skb data, + * but taking into account that pulling is expected to + * be very rare operation, it is worth to fight against + * further bloating skb head and crucify ourselves here instead. 
+ * Pure masohism, indeed. 8)8) + */ + if (eat) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + BUG_ON(!list); + + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + + if (skb_shared(list)) { + /* Sucks! We need to fork list. :-( */ + clone = skb_clone(list, GFP_ATOMIC); + if (!clone) + return NULL; + insp = list->next; + list = clone; + } else { + /* This may be pulled without + * problems. */ + insp = list; + } + if (!pskb_pull(list, eat)) { + if (clone) + kfree_skb(clone); + return NULL; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = skb_shinfo(skb)->frag_list) != insp) { + skb_shinfo(skb)->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + skb_shinfo(skb)->frag_list = clone; + } + } + /* Success! Now we may commit changes to skb data. */ + +pull_pages: + eat = delta; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size <= eat) { + put_page(skb_shinfo(skb)->frags[i].page); + eat -= skb_shinfo(skb)->frags[i].size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_shinfo(skb)->frags[k].size -= eat; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb->tail += delta; + skb->data_len -= delta; + + return skb_tail_pointer(skb); +} + +/* Copy some data bits from skb to kernel buffer. */ + +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ + int i, copy; + int start = skb_headlen(skb); + + if (offset > (int)skb->len - len) + goto fault; + + /* Copy header. */ + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_from_linear_data_offset(skb, offset, to, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(to, + vaddr + skb_shinfo(skb)->frags[i].page_offset+ + offset - start, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_bits(list, offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. 
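+ *
+ * (An illustrative sketch for skb_copy_bits() above, not part of the
+ * original source: flattening a possibly fragmented packet into a caller
+ * buffer; buf, buf_len and hand_to_firmware() are hypothetical.)
+ *
+ *	if (skb->len <= buf_len &&
+ *	    skb_copy_bits(skb, 0, buf, skb->len) == 0)
+ *		hand_to_firmware(buf, skb->len);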
+ */ +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + put_page(spd->pages[i]); +} + +static inline struct page *linear_to_page(struct page *page, unsigned int len, + unsigned int offset) +{ + struct page *p = alloc_pages(GFP_KERNEL, 0); + + if (!p) + return NULL; + memcpy(page_address(p) + offset, page_address(page) + offset, len); + + return p; +} + +/* + * Fill page/offset/length into spd, if it can hold more pages. + */ +static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, + unsigned int len, unsigned int offset, + struct sk_buff *skb, int linear) +{ + if (unlikely(spd->nr_pages == PIPE_BUFFERS)) + return 1; + + if (linear) { + page = linear_to_page(page, len, offset); + if (!page) + return 1; + } else + get_page(page); + + spd->pages[spd->nr_pages] = page; + spd->partial[spd->nr_pages].len = len; + spd->partial[spd->nr_pages].offset = offset; + spd->nr_pages++; + + return 0; +} + +static inline void __segment_seek(struct page **page, unsigned int *poff, + unsigned int *plen, unsigned int off) +{ + *poff += off; + *page += *poff / PAGE_SIZE; + *poff = *poff % PAGE_SIZE; + *plen -= off; +} + +static inline int __splice_segment(struct page *page, unsigned int poff, + unsigned int plen, unsigned int *off, + unsigned int *len, struct sk_buff *skb, + struct splice_pipe_desc *spd, int linear) +{ + if (!*len) + return 1; + + /* skip this segment if already processed */ + if (*off >= plen) { + *off -= plen; + return 0; + } + + /* ignore any bits we already processed */ + if (*off) { + __segment_seek(&page, &poff, &plen, *off); + *off = 0; + } + + do { + unsigned int flen = min(*len, plen); + + /* the linear region may spread across several pages */ + flen = min_t(unsigned int, flen, PAGE_SIZE - poff); + + if (spd_fill_page(spd, page, flen, poff, skb, linear)) + return 1; + + __segment_seek(&page, &poff, &plen, flen); + *len -= flen; + + } while (*len && plen); + + return 0; +} + +/* + * Map linear and fragment data from the skb to spd. It reports failure if the + * pipe is full or if we already spliced the requested length. + */ +static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, + unsigned int *len, + struct splice_pipe_desc *spd) +{ + int seg; + + /* + * map the linear part + */ + if (__splice_segment(virt_to_page(skb->data), + (unsigned long) skb->data & (PAGE_SIZE - 1), + skb_headlen(skb), + offset, len, skb, spd, 1)) + return 1; + + /* + * then map the fragments + */ + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + + if (__splice_segment(f->page, f->page_offset, f->size, + offset, len, skb, spd, 0)) + return 1; + } + + return 0; +} + +/* + * Map data from the skb to a pipe. Should handle both the linear part, + * the fragments, and the frag list. It does NOT handle frag lists within + * the frag list, if such a thing exists. We'd probably need to recurse to + * handle that cleanly. + */ +int skb_splice_bits(struct sk_buff *skb, unsigned int offset, + struct pipe_inode_info *pipe, unsigned int tlen, + unsigned int flags) +{ + struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &sock_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + + /* + * __skb_splice_bits() only fails if the output has no room left, + * so no point in going over the frag_list for the error case. 
+ */ + if (__skb_splice_bits(skb, &offset, &tlen, &spd)) + goto done; + else if (!tlen) + goto done; + + /* + * now see if we have a frag_list to map + */ + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list && tlen; list = list->next) { + if (__skb_splice_bits(list, &offset, &tlen, &spd)) + break; + } + } + +done: + if (spd.nr_pages) { + struct sock *sk = skb->sk; + int ret; + + /* + * Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, &spd); + lock_sock(sk); + return ret; + } + + return 0; +} + +/** + * skb_store_bits - store bits from kernel buffer to skb + * @skb: destination buffer + * @offset: offset in destination + * @from: source buffer + * @len: number of bytes to copy + * + * Copy the specified number of bytes from the source buffer to the + * destination skb. This function handles all the messy bits of + * traversing fragment lists and such. + */ + +int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) +{ + int i, copy; + int start = skb_headlen(skb); + + if (offset > (int)skb->len - len) + goto fault; + + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_to_linear_data_offset(skb, offset, from, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + int end; + + WARN_ON(start > offset + len); + + end = start + frag->size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(frag); + memcpy(vaddr + frag->page_offset + offset - start, + from, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_store_bits(list, offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +EXPORT_SYMBOL(skb_store_bits); + +/* Checksum skb data. */ + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int pos = 0; + + /* Checksum header. 
*/ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial(skb->data + offset, copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + __wsum csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial(vaddr + frag->page_offset + + offset - start, copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + __wsum csum2; + if (copy > len) + copy = len; + csum2 = skb_checksum(list, offset - start, + copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + } + BUG_ON(len); + + return csum; +} + +/* Both of above in one bottle. */ + +__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, + u8 *to, int len, __wsum csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int pos = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial_copy_nocheck(skb->data + offset, to, + copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + __wsum csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial_copy_nocheck(vaddr + + frag->page_offset + + offset - start, to, + copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + __wsum csum2; + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + csum2 = skb_copy_and_csum_bits(list, + offset - start, + to, copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + } + BUG_ON(len); + return csum; +} + +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) +{ + __wsum csum; + long csstart; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csstart = skb->csum_start - skb_headroom(skb); + else + csstart = skb_headlen(skb); + + BUG_ON(csstart > skb_headlen(skb)); + + skb_copy_from_linear_data(skb, to, csstart); + + csum = 0; + if (csstart != skb->len) + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, + skb->len - csstart, 0); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + long csstuff = csstart + skb->csum_offset; + + *((__sum16 *)(to + csstuff)) = csum_fold(csum); + } +} + +/** + * skb_dequeue - remove from the head of the queue + * 
@list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. + */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + kfree_skb(skb); +} + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_tail(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * skb_unlink - remove a buffer from a list + * @skb: buffer to remove + * @list: list to use + * + * Remove a packet from a list. The list locks are taken and this + * function is atomic with respect to other list locked calls + * + * You must know what list the SKB is on. + */ +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * skb_append - append a buffer + * @old: buffer to insert after + * @newsk: buffer to insert + * @list: list to use + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls. + * A buffer cannot be placed on two lists at the same time. 
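+ *
+ * As an illustrative aside (not part of the original source), the locked
+ * queue helpers above are commonly used for a driver-private backlog;
+ * tx_backlog and transmit_one() are hypothetical, and the head must have
+ * been initialised with skb_queue_head_init():
+ *
+ *	skb_queue_tail(&tx_backlog, skb);		(producer)
+ *
+ *	while ((skb = skb_dequeue(&tx_backlog)) != NULL)
+ *		transmit_one(skb);			(consumer)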
+ */ +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_after(list, old, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + + +/** + * skb_insert - insert a buffer + * @old: buffer to insert before + * @newsk: buffer to insert + * @list: list to use + * + * Place a packet before a given packet in a list. The list locks are + * taken and this function is atomic with respect to other list locked + * calls. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_insert(newsk, old->prev, old, list); + spin_unlock_irqrestore(&list->lock, flags); +} + +static inline void skb_split_inside_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, const int pos) +{ + int i; + + skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), + pos - len); + /* And move data appendix as is. */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; + skb1->len += skb1->data_len; + skb->data_len = 0; + skb->len = len; + skb_set_tail_pointer(skb, len); +} + +static inline void skb_split_no_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, int pos) +{ + int i, k = 0; + const int nfrags = skb_shinfo(skb)->nr_frags; + + skb_shinfo(skb)->nr_frags = 0; + skb1->len = skb1->data_len = skb->len - len; + skb->len = len; + skb->data_len = len - pos; + + for (i = 0; i < nfrags; i++) { + int size = skb_shinfo(skb)->frags[i].size; + + if (pos + size > len) { + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < len) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_shinfo(skb1)->frags[0].size -= len - pos; + skb_shinfo(skb)->frags[i].size = len - pos; + skb_shinfo(skb)->nr_frags++; + } + k++; + } else + skb_shinfo(skb)->nr_frags++; + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; +} + +/** + * skb_split - Split fragmented skb to two parts at length len. + * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb + */ +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) +{ + int pos = skb_headlen(skb); + + if (len < pos) /* Split line is inside header. */ + skb_split_inside_header(skb, skb1, len, pos); + else /* Second chunk has no header, nothing to copy. */ + skb_split_no_header(skb, skb1, len, pos); +} + +/* Shifting from/to a cloned skb is a no-go. + * + * Caller cannot keep skb_shinfo related pointers past calling here! 
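+ *
+ * (An illustrative sketch for skb_split() above, not part of the original
+ * source: splitting an oversized segment at mss bytes, so that skb keeps
+ * the first mss bytes and skb2 receives the remainder; mss, nsize and
+ * skb2 are hypothetical.)
+ *
+ *	nsize = skb_headlen(skb) > mss ? skb_headlen(skb) - mss : 0;
+ *	skb2 = alloc_skb(nsize, GFP_ATOMIC);
+ *	if (skb2)
+ *		skb_split(skb, skb2, mss);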
+ */ +static int skb_prepare_for_shift(struct sk_buff *skb) +{ + return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +/** + * skb_shift - Shifts paged data partially from skb to another + * @tgt: buffer into which tail data gets added + * @skb: buffer from which the paged data comes from + * @shiftlen: shift up to this many bytes + * + * Attempts to shift up to shiftlen worth of bytes, which may be less than + * the length of the skb, from tgt to skb. Returns number bytes shifted. + * It's up to caller to free skb if everything was shifted. + * + * If @tgt runs out of frags, the whole operation is aborted. + * + * Skb cannot include anything else but paged data while tgt is allowed + * to have non-paged data as well. + * + * TODO: full sized shift could be optimized but that would need + * specialized skb free'er to handle frags without up-to-date nr_frags. + */ +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) +{ + int from, to, merge, todo; + struct skb_frag_struct *fragfrom, *fragto; + + BUG_ON(shiftlen > skb->len); + BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + todo = shiftlen; + from = 0; + to = skb_shinfo(tgt)->nr_frags; + fragfrom = &skb_shinfo(skb)->frags[from]; + + /* Actual merge is delayed until the point when we know we can + * commit all, so that we don't have to undo partial changes + */ + if (!to || + !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) { + merge = -1; + } else { + merge = to - 1; + + todo -= fragfrom->size; + if (todo < 0) { + if (skb_prepare_for_shift(skb) || + skb_prepare_for_shift(tgt)) + return 0; + + /* All previous frag pointers might be stale! */ + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += shiftlen; + fragfrom->size -= shiftlen; + fragfrom->page_offset += shiftlen; + + goto onlymerged; + } + + from++; + } + + /* Skip full, not-fitting skb to avoid expensive operations */ + if ((shiftlen == skb->len) && + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) + return 0; + + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) + return 0; + + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { + if (to == MAX_SKB_FRAGS) + return 0; + + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[to]; + + if (todo >= fragfrom->size) { + *fragto = *fragfrom; + todo -= fragfrom->size; + from++; + to++; + + } else { + get_page(fragfrom->page); + fragto->page = fragfrom->page; + fragto->page_offset = fragfrom->page_offset; + fragto->size = todo; + + fragfrom->page_offset += todo; + fragfrom->size -= todo; + todo = 0; + + to++; + break; + } + } + + /* Ready to "commit" this state change to tgt */ + skb_shinfo(tgt)->nr_frags = to; + + if (merge >= 0) { + fragfrom = &skb_shinfo(skb)->frags[0]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += fragfrom->size; + put_page(fragfrom->page); + } + + /* Reposition in the original skb */ + to = 0; + while (from < skb_shinfo(skb)->nr_frags) + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; + skb_shinfo(skb)->nr_frags = to; + + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); + +onlymerged: + /* Most likely the tgt won't ever need its checksum anymore, skb on + * the other hand might need it if it needs to be resent + */ + tgt->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + + /* Yak, is it really working this way? Some helper please? 
*/ + skb->len -= shiftlen; + skb->data_len -= shiftlen; + skb->truesize -= shiftlen; + tgt->len += shiftlen; + tgt->data_len += shiftlen; + tgt->truesize += shiftlen; + + return shiftlen; +} + +/** + * skb_prepare_seq_read - Prepare a sequential read of skb data + * @skb: the buffer to read + * @from: lower offset of data to be read + * @to: upper offset of data to be read + * @st: state variable + * + * Initializes the specified state variable. Must be called before + * invoking skb_seq_read() for the first time. + */ +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, + unsigned int to, struct skb_seq_state *st) +{ + st->lower_offset = from; + st->upper_offset = to; + st->root_skb = st->cur_skb = skb; + st->frag_idx = st->stepped_offset = 0; + st->frag_data = NULL; +} + +/** + * skb_seq_read - Sequentially read skb data + * @consumed: number of bytes consumed by the caller so far + * @data: destination pointer for data to be returned + * @st: state variable + * + * Reads a block of skb data at &consumed relative to the + * lower offset specified to skb_prepare_seq_read(). Assigns + * the head of the data block to &data and returns the length + * of the block or 0 if the end of the skb data or the upper + * offset has been reached. + * + * The caller is not required to consume all of the data + * returned, i.e. &consumed is typically set to the number + * of bytes already consumed and the next call to + * skb_seq_read() will return the remaining part of the block. + * + * Note 1: The size of each block of data returned can be arbitary, + * this limitation is the cost for zerocopy seqeuental + * reads of potentially non linear data. + * + * Note 2: Fragment lists within fragments are not implemented + * at the moment, state->root_skb could be replaced with + * a stack for this purpose. + */ +unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st) +{ + unsigned int block_limit, abs_offset = consumed + st->lower_offset; + skb_frag_t *frag; + + if (unlikely(abs_offset >= st->upper_offset)) + return 0; + +next_skb: + block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; + + if (abs_offset < block_limit) { + *data = st->cur_skb->data + (abs_offset - st->stepped_offset); + return block_limit - abs_offset; + } + + if (st->frag_idx == 0 && !st->frag_data) + st->stepped_offset += skb_headlen(st->cur_skb); + + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; + block_limit = frag->size + st->stepped_offset; + + if (abs_offset < block_limit) { + if (!st->frag_data) + st->frag_data = kmap_skb_frag(frag); + + *data = (u8 *) st->frag_data + frag->page_offset + + (abs_offset - st->stepped_offset); + + return block_limit - abs_offset; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + st->frag_idx++; + st->stepped_offset += frag->size; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + if (st->root_skb == st->cur_skb && + skb_shinfo(st->root_skb)->frag_list) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + st->frag_idx = 0; + goto next_skb; + } else if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; + goto next_skb; + } + + return 0; +} + +/** + * skb_abort_seq_read - Abort a sequential read of skb data + * @st: state variable + * + * Must be called if skb_seq_read() was not called until it + * returned 0. 
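+ *
+ * An illustrative sketch of the whole read cycle (not part of the original
+ * source); found_marker() is a hypothetical predicate whose early exit is
+ * what makes the abort necessary:
+ *
+ *	struct skb_seq_state st;
+ *	const u8 *data;
+ *	unsigned int len, consumed = 0;
+ *
+ *	skb_prepare_seq_read(skb, 0, skb->len, &st);
+ *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
+ *		if (found_marker(data, len)) {
+ *			skb_abort_seq_read(&st);
+ *			break;
+ *		}
+ *		consumed += len;
+ *	}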
+ */ +void skb_abort_seq_read(struct skb_seq_state *st) +{ + if (st->frag_data) + kunmap_skb_frag(st->frag_data); +} + +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) + +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, + struct ts_config *conf, + struct ts_state *state) +{ + return skb_seq_read(offset, text, TS_SKB_CB(state)); +} + +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) +{ + skb_abort_seq_read(TS_SKB_CB(state)); +} + +/** + * skb_find_text - Find a text pattern in skb data + * @skb: the buffer to look in + * @from: search offset + * @to: search limit + * @config: textsearch configuration + * @state: uninitialized textsearch state variable + * + * Finds a pattern in the skb data according to the specified + * textsearch configuration. Use textsearch_next() to retrieve + * subsequent occurrences of the pattern. Returns the offset + * to the first occurrence or UINT_MAX if no match was found. + */ +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config, + struct ts_state *state) +{ + unsigned int ret; + + config->get_next_block = skb_ts_get_next_block; + config->finish = skb_ts_finish; + + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + + ret = textsearch_find(config, state); + return (ret <= to - from ? ret : UINT_MAX); +} + +/** + * skb_append_datato_frags: - append the user data to a skb + * @sk: sock structure + * @skb: skb structure to be appened with user data. + * @getfrag: call back function to be used for getting the user data + * @from: pointer to user message iov + * @length: length of the iov message + * + * Description: This procedure append the user data in the fragment part + * of the skb if any page alloc fails user this procedure returns -ENOMEM + */ +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, + int (*getfrag)(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length) +{ + int frg_cnt = 0; + skb_frag_t *frag = NULL; + struct page *page = NULL; + int copy, left; + int offset = 0; + int ret; + + do { + /* Return error if we don't have space for new frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + if (frg_cnt >= MAX_SKB_FRAGS) + return -EFAULT; + + /* allocate a new page for next frag */ + page = alloc_pages(sk->sk_allocation, 0); + + /* If alloc_page fails just return failure and caller will + * free previous allocated pages by doing kfree_skb() + */ + if (page == NULL) + return -ENOMEM; + + /* initialize the next frag */ + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + skb_fill_page_desc(skb, frg_cnt, page, 0, 0); + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + + /* get the new initialized frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; + + /* copy the user data to page */ + left = PAGE_SIZE - frag->page_offset; + copy = (length > left)? 
left : length; + + ret = getfrag(from, (page_address(frag->page) + + frag->page_offset + frag->size), + offset, copy, 0, skb); + if (ret < 0) + return -EFAULT; + + /* copy was successful so update the size parameters */ + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + offset += copy; + length -= copy; + + } while (length > 0); + + return 0; +} + +/** + * skb_pull_rcsum - pull skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_pull on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_pull unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. + */ +unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) +{ + BUG_ON(len > skb->len); + skb->len -= len; + BUG_ON(skb->len < skb->data_len); + skb_postpull_rcsum(skb, skb->data, len); + return skb->data += len; +} + +EXPORT_SYMBOL_GPL(skb_pull_rcsum); + +/** + * skb_segment - Perform protocol segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function performs segmentation on the given skb. It returns + * a pointer to the first in a list of new skbs for the segments. + * In case of error it returns ERR_PTR(err). + */ +struct sk_buff *skb_segment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + struct sk_buff *fskb = skb_shinfo(skb)->frag_list; + unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int doffset = skb->data - skb_mac_header(skb); + unsigned int offset = doffset; + unsigned int headroom; + unsigned int len; + int sg = features & NETIF_F_SG; + int nfrags = skb_shinfo(skb)->nr_frags; + int err = -ENOMEM; + int i = 0; + int pos; + + __skb_push(skb, doffset); + headroom = skb_headroom(skb); + pos = skb_headlen(skb); + + do { + struct sk_buff *nskb; + skb_frag_t *frag; + int hsize; + int size; + + len = skb->len - offset; + if (len > mss) + len = mss; + + hsize = skb_headlen(skb) - offset; + if (hsize < 0) + hsize = 0; + if (hsize > len || !sg) + hsize = len; + + if (!hsize && i >= nfrags) { + BUG_ON(fskb->len != len); + + pos += len; + nskb = skb_clone(fskb, GFP_ATOMIC); + fskb = fskb->next; + + if (unlikely(!nskb)) + goto err; + + hsize = skb_end_pointer(nskb) - nskb->head; + if (skb_cow_head(nskb, doffset + headroom)) { + kfree_skb(nskb); + goto err; + } + + nskb->truesize += skb_end_pointer(nskb) - nskb->head - + hsize; + skb_release_head_state(nskb); + __skb_push(nskb, doffset); + } else { + nskb = alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC); + + if (unlikely(!nskb)) + goto err; + + skb_reserve(nskb, headroom); + __skb_put(nskb, doffset); + } + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + __copy_skb_header(nskb, skb); + nskb->mac_len = skb->mac_len; + + skb_reset_mac_header(nskb); + skb_set_network_header(nskb, skb->mac_len); + nskb->transport_header = (nskb->network_header + + skb_network_header_len(skb)); + skb_copy_from_linear_data(skb, nskb->data, doffset); + + if (pos >= offset + len) + continue; + + if (!sg) { + nskb->ip_summed = CHECKSUM_NONE; + nskb->csum = skb_copy_and_csum_bits(skb, offset, + skb_put(nskb, len), + len, 0); + continue; + } + + frag = skb_shinfo(nskb)->frags; + + skb_copy_from_linear_data_offset(skb, offset, + skb_put(nskb, hsize), hsize); + + while (pos < 
offset + len && i < nfrags) { + *frag = skb_shinfo(skb)->frags[i]; + get_page(frag->page); + size = frag->size; + + if (pos < offset) { + frag->page_offset += offset - pos; + frag->size -= offset - pos; + } + + skb_shinfo(nskb)->nr_frags++; + + if (pos + size <= offset + len) { + i++; + pos += size; + } else { + frag->size -= pos + size - (offset + len); + goto skip_fraglist; + } + + frag++; + } + + if (pos < offset + len) { + struct sk_buff *fskb2 = fskb; + + BUG_ON(pos + fskb->len != offset + len); + + pos += fskb->len; + fskb = fskb->next; + + if (fskb2->next) { + fskb2 = skb_clone(fskb2, GFP_ATOMIC); + if (!fskb2) + goto err; + } else + skb_get(fskb2); + + BUG_ON(skb_shinfo(nskb)->frag_list); + skb_shinfo(nskb)->frag_list = fskb2; + } + +skip_fraglist: + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + } while ((offset += len) < skb->len); + + return segs; + +err: + while ((skb = segs)) { + segs = skb->next; + kfree_skb(skb); + } + return ERR_PTR(err); +} + +EXPORT_SYMBOL_GPL(skb_segment); + +int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff *p = *head; + struct sk_buff *nskb; + unsigned int headroom; + unsigned int hlen = p->data - skb_mac_header(p); + unsigned int len = skb->len; + + if (hlen + p->len + len >= 65536) + return -E2BIG; + + if (skb_shinfo(p)->frag_list) + goto merge; + else if (!skb_headlen(p) && !skb_headlen(skb) && + skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < + MAX_SKB_FRAGS) { + memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, + skb_shinfo(skb)->frags, + skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + + skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + + skb->truesize -= skb->data_len; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = 1; + goto done; + } + + headroom = skb_headroom(p); + nskb = netdev_alloc_skb(p->dev, headroom); + if (unlikely(!nskb)) + return -ENOMEM; + + __copy_skb_header(nskb, p); + nskb->mac_len = p->mac_len; + + skb_reserve(nskb, headroom); + + skb_set_mac_header(nskb, -hlen); + skb_set_network_header(nskb, skb_network_offset(p)); + skb_set_transport_header(nskb, skb_transport_offset(p)); + + memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + + *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); + skb_shinfo(nskb)->frag_list = p; + skb_shinfo(nskb)->gso_size = skb_shinfo(p)->gso_size; + skb_header_release(p); + nskb->prev = p; + + nskb->data_len += p->len; + nskb->truesize += p->len; + nskb->len += p->len; + + *head = nskb; + nskb->next = p->next; + p->next = NULL; + + p = nskb; + +merge: + p->prev->next = skb; + p->prev = skb; + skb_header_release(skb); + +done: + NAPI_GRO_CB(p)->count++; + p->data_len += len; + p->truesize += len; + p->len += len; + + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive); + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + (2*sizeof(struct sk_buff)) + + sizeof(atomic_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} + +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified 
scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. + */ +static int +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int elt = 0; + + if (copy > 0) { + if (copy > len) + copy = len; + sg_set_buf(sg, skb->data + offset, copy); + elt++; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + sg_set_page(&sg[elt], frag->page, copy, + frag->page_offset+offset-start); + elt++; + if (!(len -= copy)) + return elt; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + elt += __skb_to_sgvec(list, sg+elt, offset - start, + copy); + if ((len -= copy) == 0) + return elt; + offset += copy; + } + start = end; + } + } + BUG_ON(len); + return elt; +} + +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int nsg = __skb_to_sgvec(skb, sg, offset, len); + + sg_mark_end(&sg[nsg - 1]); + + return nsg; +} + +/** + * skb_cow_data - Check that a socket buffer's data buffers are writable + * @skb: The socket buffer to check. + * @tailbits: Amount of trailing space to be added + * @trailer: Returned pointer to the skb where the @tailbits space begins + * + * Make sure that the data buffers attached to a socket buffer are + * writable. If they are not, private copies are made of the data buffers + * and the socket buffer is set to use these instead. + * + * If @tailbits is given, make sure that there is space to write @tailbits + * bytes of data beyond current end of socket buffer. @trailer will be + * set to point to the skb in which this space begins. + * + * The number of scatterlist elements required to completely map the + * COW'd and extended socket buffer will be returned. + */ +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) +{ + int copyflag; + int elt; + struct sk_buff *skb1, **skb_p; + + /* If skb is cloned or its head is paged, reallocate + * head pulling out all the pages (pages are considered not writable + * at the moment even if they are anonymous). + */ + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && + __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + return -ENOMEM; + + /* Easy case. Most of packets will go this way. */ + if (!skb_shinfo(skb)->frag_list) { + /* A little of trouble, not enough of space for trailer. + * This should not happen, when stack is tuned to generate + * good frames. OK, on miss we reallocate and reserve even more + * space, 128 bytes is fair. */ + + if (skb_tailroom(skb) < tailbits && + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) + return -ENOMEM; + + /* Voila! */ + *trailer = skb; + return 1; + } + + /* Misery. We are in troubles, going to mincer fragments... */ + + elt = 1; + skb_p = &skb_shinfo(skb)->frag_list; + copyflag = 0; + + while ((skb1 = *skb_p) != NULL) { + int ntail = 0; + + /* The fragment is partially pulled by someone, + * this can happen on input. 
Copy it and everything + * after it. */ + + if (skb_shared(skb1)) + copyflag = 1; + + /* If the skb is the last, worry about trailer. */ + + if (skb1->next == NULL && tailbits) { + if (skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list || + skb_tailroom(skb1) < tailbits) + ntail = tailbits + 128; + } + + if (copyflag || + skb_cloned(skb1) || + ntail || + skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list) { + struct sk_buff *skb2; + + /* Fuck, we are miserable poor guys... */ + if (ntail == 0) + skb2 = skb_copy(skb1, GFP_ATOMIC); + else + skb2 = skb_copy_expand(skb1, + skb_headroom(skb1), + ntail, + GFP_ATOMIC); + if (unlikely(skb2 == NULL)) + return -ENOMEM; + + if (skb1->sk) + skb_set_owner_w(skb2, skb1->sk); + + /* Looking around. Are we still alive? + * OK, link new skb, drop old one */ + + skb2->next = skb1->next; + *skb_p = skb2; + kfree_skb(skb1); + skb1 = skb2; + } + elt++; + *trailer = skb1; + skb_p = &skb1->next; + } + + return elt; +} + +/** + * skb_partial_csum_set - set up and verify partial csum values for packet + * @skb: the skb to set + * @start: the number of bytes after skb->data to start checksumming. + * @off: the offset from start to place the checksum. + * + * For untrusted partially-checksummed packets, we need to make sure the values + * for skb->csum_start and skb->csum_offset are valid so we don't oops. + * + * This function checks and sets those values and skb->ip_summed: if this + * returns false you should drop the packet. + */ +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) +{ + if (unlikely(start > skb->len - 2) || + unlikely((int)start + off > skb->len - 2)) { + if (net_ratelimit()) + printk(KERN_WARNING + "bad partial csum: csum=%u/%u len=%u\n", + start, off, skb->len); + return false; + } + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + start; + skb->csum_offset = off; + return true; +} + +void __skb_warn_lro_forwarding(const struct sk_buff *skb) +{ + if (net_ratelimit()) + pr_warning("%s: received packets cannot be forwarded" + " while LRO is enabled\n", skb->dev->name); +} + +EXPORT_SYMBOL(___pskb_trim); +EXPORT_SYMBOL(__kfree_skb); +EXPORT_SYMBOL(kfree_skb); +EXPORT_SYMBOL(__pskb_pull_tail); +EXPORT_SYMBOL(__alloc_skb); +EXPORT_SYMBOL(__netdev_alloc_skb); +EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(pskb_expand_head); +EXPORT_SYMBOL(skb_checksum); +EXPORT_SYMBOL(skb_clone); +EXPORT_SYMBOL(skb_copy); +EXPORT_SYMBOL(skb_copy_and_csum_bits); +EXPORT_SYMBOL(skb_copy_and_csum_dev); +EXPORT_SYMBOL(skb_copy_bits); +EXPORT_SYMBOL(skb_copy_expand); +EXPORT_SYMBOL(skb_over_panic); +EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(skb_realloc_headroom); +EXPORT_SYMBOL(skb_under_panic); +EXPORT_SYMBOL(skb_dequeue); +EXPORT_SYMBOL(skb_dequeue_tail); +EXPORT_SYMBOL(skb_insert); +EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_head); +EXPORT_SYMBOL(skb_queue_tail); +EXPORT_SYMBOL(skb_unlink); +EXPORT_SYMBOL(skb_append); +EXPORT_SYMBOL(skb_split); +EXPORT_SYMBOL(skb_prepare_seq_read); +EXPORT_SYMBOL(skb_seq_read); +EXPORT_SYMBOL(skb_abort_seq_read); +EXPORT_SYMBOL(skb_find_text); +EXPORT_SYMBOL(skb_append_datato_frags); +EXPORT_SYMBOL(__skb_warn_lro_forwarding); + +EXPORT_SYMBOL_GPL(skb_to_sgvec); +EXPORT_SYMBOL_GPL(skb_cow_data); +EXPORT_SYMBOL_GPL(skb_partial_csum_set); diff --git a/libdde-linux26/lib/src/net/core/utils.c b/libdde-linux26/lib/src/net/core/utils.c new file mode 100644 index 00000000..5d10a675 --- /dev/null +++ b/libdde-linux26/lib/src/net/core/utils.c @@ -0,0 +1,309 @@ +/* + * Generic 
address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andi Kleen + * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project + * + * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/inet.h> +#include <linux/mm.h> +#include <linux/net.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/random.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <net/sock.h> + +#include <asm/byteorder.h> +#include <asm/system.h> +#include <asm/uaccess.h> + +#ifndef DDE_LINUX +int net_msg_cost __read_mostly = 5*HZ; +DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); +#else +int net_msg_cost = 500; +#endif /* DDE_LINUX */ +int net_msg_burst __read_mostly = 10; +int net_msg_warn __read_mostly = 1; +EXPORT_SYMBOL(net_msg_warn); + +/* + * All net warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ +#ifndef DDE_LINUX + return __ratelimit(&net_ratelimit_state); +#else + return 0; +#endif +} +EXPORT_SYMBOL(net_ratelimit); + +/* + * Convert an ASCII string to binary IP. + * This is outside of net/ipv4/ because various code that uses IP addresses + * is otherwise not dependent on the TCP/IP stack. + */ + +__be32 in_aton(const char *str) +{ + unsigned long l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) + { + l <<= 8; + if (*str != '\0') + { + val = 0; + while (*str != '\0' && *str != '.' && *str != '\n') + { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return(htonl(l)); +} + +EXPORT_SYMBOL(in_aton); + +#define IN6PTON_XDIGIT 0x00010000 +#define IN6PTON_DIGIT 0x00020000 +#define IN6PTON_COLON_MASK 0x00700000 +#define IN6PTON_COLON_1 0x00100000 /* single : requested */ +#define IN6PTON_COLON_2 0x00200000 /* second : requested */ +#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */ +#define IN6PTON_DOT 0x00800000 /* . */ +#define IN6PTON_DELIM 0x10000000 +#define IN6PTON_NULL 0x20000000 /* first/tail */ +#define IN6PTON_UNKNOWN 0x40000000 + +static inline int xdigit2bin(char c, int delim) +{ + if (c == delim || c == '\0') + return IN6PTON_DELIM; + if (c == ':') + return IN6PTON_COLON_MASK; + if (c == '.') + return IN6PTON_DOT; + if (c >= '0' && c <= '9') + return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0')); + if (c >= 'a' && c <= 'f') + return (IN6PTON_XDIGIT | (c - 'a' + 10)); + if (c >= 'A' && c <= 'F') + return (IN6PTON_XDIGIT | (c - 'A' + 10)); + if (delim == -1) + return IN6PTON_DELIM; + return IN6PTON_UNKNOWN; +} + +int in4_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s; + u8 *d; + u8 dbuf[4]; + int ret = 0; + int i; + int w = 0; + + if (srclen < 0) + srclen = strlen(src); + s = src; + d = dbuf; + i = 0; + while(1) { + int c; + c = xdigit2bin(srclen > 0 ? 
*s : '\0', delim); + if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { + goto out; + } + if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (w == 0) + goto out; + *d++ = w & 0xff; + w = 0; + i++; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (i != 4) + goto out; + break; + } + goto cont; + } + w = (w * 10) + c; + if ((w & 0xffff) > 255) { + goto out; + } +cont: + if (i >= 4) + goto out; + s++; + srclen--; + } + ret = 1; + memcpy(dst, dbuf, sizeof(dbuf)); +out: + if (end) + *end = s; + return ret; +} + +EXPORT_SYMBOL(in4_pton); + +int in6_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s, *tok = NULL; + u8 *d, *dc = NULL; + u8 dbuf[16]; + int ret = 0; + int i; + int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL; + int w = 0; + + memset(dbuf, 0, sizeof(dbuf)); + + s = src; + d = dbuf; + if (srclen < 0) + srclen = strlen(src); + + while (1) { + int c; + + c = xdigit2bin(srclen > 0 ? *s : '\0', delim); + if (!(c & state)) + goto out; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + /* process one 16-bit word */ + if (!(state & IN6PTON_NULL)) { + *d++ = (w >> 8) & 0xff; + *d++ = w & 0xff; + } + w = 0; + if (c & IN6PTON_DELIM) { + /* We've processed last word */ + break; + } + /* + * COLON_1 => XDIGIT + * COLON_2 => XDIGIT|DELIM + * COLON_1_2 => COLON_2 + */ + switch (state & IN6PTON_COLON_MASK) { + case IN6PTON_COLON_2: + dc = d; + state = IN6PTON_XDIGIT | IN6PTON_DELIM; + if (dc - dbuf >= sizeof(dbuf)) + state |= IN6PTON_NULL; + break; + case IN6PTON_COLON_1|IN6PTON_COLON_1_2: + state = IN6PTON_XDIGIT | IN6PTON_COLON_2; + break; + case IN6PTON_COLON_1: + state = IN6PTON_XDIGIT; + break; + case IN6PTON_COLON_1_2: + state = IN6PTON_COLON_2; + break; + default: + state = 0; + } + tok = s + 1; + goto cont; + } + + if (c & IN6PTON_DOT) { + ret = in4_pton(tok ? 
tok : s, srclen + (int)(s - tok), d, delim, &s); + if (ret > 0) { + d += 4; + break; + } + goto out; + } + + w = (w << 4) | (0xff & c); + state = IN6PTON_COLON_1 | IN6PTON_DELIM; + if (!(w & 0xf000)) { + state |= IN6PTON_XDIGIT; + } + if (!dc && d + 2 < dbuf + sizeof(dbuf)) { + state |= IN6PTON_COLON_1_2; + state &= ~IN6PTON_DELIM; + } + if (d + 2 >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2); + } +cont: + if ((dc && d + 4 < dbuf + sizeof(dbuf)) || + d + 4 == dbuf + sizeof(dbuf)) { + state |= IN6PTON_DOT; + } + if (d >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK); + } + s++; + srclen--; + } + + i = 15; d--; + + if (dc) { + while(d >= dc) + dst[i--] = *d--; + while(i >= dc - dbuf) + dst[i--] = 0; + while(i >= 0) + dst[i--] = *d--; + } else + memcpy(dst, dbuf, sizeof(dbuf)); + + ret = 1; +out: + if (end) + *end = s; + return ret; +} + +EXPORT_SYMBOL(in6_pton); + +void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, + __be32 from, __be32 to, int pseudohdr) +{ + __be32 diff[] = { ~from, to }; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_partial(diff, sizeof(diff), + ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_partial(diff, sizeof(diff), + ~skb->csum); + } else if (pseudohdr) + *sum = ~csum_fold(csum_partial(diff, sizeof(diff), + csum_unfold(*sum))); +} +EXPORT_SYMBOL(inet_proto_csum_replace4); diff --git a/libdde-linux26/lib/src/net/netlink/af_netlink.c b/libdde-linux26/lib/src/net/netlink/af_netlink.c new file mode 100644 index 00000000..3f00a014 --- /dev/null +++ b/libdde-linux26/lib/src/net/netlink/af_netlink.c @@ -0,0 +1,2013 @@ +/* + * NETLINK Kernel-user communication protocol. + * + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith + * added netlink_proto_exit + * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. 
de Melo <acme@conectiva.com.br> + * use nlk_sk, as sk->protinfo is on a diet 8) + * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> + * - inc module use count of module that owns + * the kernel socket in case userspace opens + * socket of same protocol + * - remove all module support, since netlink is + * mandatory if CONFIG_NET=y these days + */ + +#include <linux/module.h> + +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/un.h> +#include <linux/fcntl.h> +#include <linux/termios.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/notifier.h> +#include <linux/security.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/random.h> +#include <linux/bitops.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/audit.h> +#include <linux/mutex.h> + +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/scm.h> +#include <net/netlink.h> + +#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) +#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) + +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 pid; + u32 dst_pid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + struct module *module; +}; + +#define NETLINK_KERNEL_SOCKET 0x1 +#define NETLINK_RECV_PKTINFO 0x2 + +static inline struct netlink_sock *nlk_sk(struct sock *sk) +{ + return container_of(sk, struct netlink_sock, sk); +} + +static inline int netlink_is_kernel(struct sock *sk) +{ + return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; +} + +struct nl_pid_hash { + struct hlist_head *table; + unsigned long rehash_time; + + unsigned int mask; + unsigned int shift; + + unsigned int entries; + unsigned int max_shift; + + u32 rnd; +}; + +struct netlink_table { + struct nl_pid_hash hash; + struct hlist_head mc_list; + unsigned long *listeners; + unsigned int nl_nonroot; + unsigned int groups; + struct mutex *cb_mutex; + struct module *module; + int registered; +}; + +static struct netlink_table *nl_table; + +static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); + +static int netlink_dump(struct sock *sk); +static void netlink_destroy_callback(struct netlink_callback *cb); + +static DEFINE_RWLOCK(nl_table_lock); +static atomic_t nl_table_users = ATOMIC_INIT(0); + +static ATOMIC_NOTIFIER_HEAD(netlink_chain); + +static u32 netlink_group_mask(u32 group) +{ + return group ? 
1 << (group - 1) : 0; +} + +static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) +{ + return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; +} + +static void netlink_sock_destruct(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->cb) { + if (nlk->cb->done) + nlk->cb->done(nlk->cb); + netlink_destroy_callback(nlk->cb); + } + + skb_queue_purge(&sk->sk_receive_queue); + + if (!sock_flag(sk, SOCK_DEAD)) { + printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); + return; + } + + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + WARN_ON(nlk_sk(sk)->groups); +} + +/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on + * SMP. Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines. + */ + +static void netlink_table_grab(void) + __acquires(nl_table_lock) +{ + write_lock_irq(&nl_table_lock); + + if (atomic_read(&nl_table_users)) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&nl_table_wait, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&nl_table_users) == 0) + break; + write_unlock_irq(&nl_table_lock); + schedule(); + write_lock_irq(&nl_table_lock); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nl_table_wait, &wait); + } +} + +static void netlink_table_ungrab(void) + __releases(nl_table_lock) +{ + write_unlock_irq(&nl_table_lock); + wake_up(&nl_table_wait); +} + +static inline void +netlink_lock_table(void) +{ + /* read_lock() synchronizes us to netlink_table_grab */ + + read_lock(&nl_table_lock); + atomic_inc(&nl_table_users); + read_unlock(&nl_table_lock); +} + +static inline void +netlink_unlock_table(void) +{ + if (atomic_dec_and_test(&nl_table_users)) + wake_up(&nl_table_wait); +} + +static inline struct sock *netlink_lookup(struct net *net, int protocol, + u32 pid) +{ + struct nl_pid_hash *hash = &nl_table[protocol].hash; + struct hlist_head *head; + struct sock *sk; + struct hlist_node *node; + + read_lock(&nl_table_lock); + head = nl_pid_hashfn(hash, pid); + sk_for_each(sk, node, head) { + if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) { + sock_hold(sk); + goto found; + } + } + sk = NULL; +found: + read_unlock(&nl_table_lock); + return sk; +} + +static inline struct hlist_head *nl_pid_hash_zalloc(size_t size) +{ + if (size <= PAGE_SIZE) + return kzalloc(size, GFP_ATOMIC); + else + return (struct hlist_head *) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); +} + +static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) +{ + if (size <= PAGE_SIZE) + kfree(table); + else + free_pages((unsigned long)table, get_order(size)); +} + +static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) +{ + unsigned int omask, mask, shift; + size_t osize, size; + struct hlist_head *otable, *table; + int i; + + omask = mask = hash->mask; + osize = size = (mask + 1) * sizeof(*table); + shift = hash->shift; + + if (grow) { + if (++shift > hash->max_shift) + return 0; + mask = mask * 2 + 1; + size *= 2; + } + + table = nl_pid_hash_zalloc(size); + if (!table) + return 0; + + otable = hash->table; + hash->table = table; + hash->mask = mask; + hash->shift = shift; + get_random_bytes(&hash->rnd, sizeof(hash->rnd)); + + for (i = 0; i <= omask; i++) { + struct sock *sk; + struct hlist_node *node, *tmp; + + 
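	/* Re-link each socket from the old bucket into its new bucket, computed with the enlarged mask and the fresh random seed. */ +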
sk_for_each_safe(sk, node, tmp, &otable[i]) + __sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid)); + } + + nl_pid_hash_free(otable, osize); + hash->rehash_time = jiffies + 10 * 60 * HZ; + return 1; +} + +static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) +{ + int avg = hash->entries >> hash->shift; + + if (unlikely(avg > 1) && nl_pid_hash_rehash(hash, 1)) + return 1; + + if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { + nl_pid_hash_rehash(hash, 0); + return 1; + } + + return 0; +} + +static const struct proto_ops netlink_ops; + +static void +netlink_update_listeners(struct sock *sk) +{ + struct netlink_table *tbl = &nl_table[sk->sk_protocol]; + struct hlist_node *node; + unsigned long mask; + unsigned int i; + + for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { + mask = 0; + sk_for_each_bound(sk, node, &tbl->mc_list) { + if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) + mask |= nlk_sk(sk)->groups[i]; + } + tbl->listeners[i] = mask; + } + /* this function is only called with the netlink table "grabbed", which + * makes sure updates are visible before bind or setsockopt return. */ +} + +static int netlink_insert(struct sock *sk, struct net *net, u32 pid) +{ + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; + int err = -EADDRINUSE; + struct sock *osk; + struct hlist_node *node; + int len; + + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + len = 0; + sk_for_each(osk, node, head) { + if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->pid == pid)) + break; + len++; + } + if (node) + goto err; + + err = -EBUSY; + if (nlk_sk(sk)->pid) + goto err; + + err = -ENOMEM; + if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) + goto err; + + if (len && nl_pid_hash_dilute(hash, len)) + head = nl_pid_hashfn(hash, pid); + hash->entries++; + nlk_sk(sk)->pid = pid; + sk_add_node(sk, head); + err = 0; + +err: + netlink_table_ungrab(); + return err; +} + +static void netlink_remove(struct sock *sk) +{ + netlink_table_grab(); + if (sk_del_node_init(sk)) + nl_table[sk->sk_protocol].hash.entries--; + if (nlk_sk(sk)->subscriptions) + __sk_del_bind_node(sk); + netlink_table_ungrab(); +} + +static struct proto netlink_proto = { + .name = "NETLINK", + .owner = THIS_MODULE, + .obj_size = sizeof(struct netlink_sock), +}; + +static int __netlink_create(struct net *net, struct socket *sock, + struct mutex *cb_mutex, int protocol) +{ + struct sock *sk; + struct netlink_sock *nlk; + + sock->ops = &netlink_ops; + + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + nlk = nlk_sk(sk); + if (cb_mutex) + nlk->cb_mutex = cb_mutex; + else { + nlk->cb_mutex = &nlk->cb_def_mutex; + mutex_init(nlk->cb_mutex); + } + init_waitqueue_head(&nlk->wait); + + sk->sk_destruct = netlink_sock_destruct; + sk->sk_protocol = protocol; + return 0; +} + +static int netlink_create(struct net *net, struct socket *sock, int protocol) +{ + struct module *module = NULL; + struct mutex *cb_mutex; + struct netlink_sock *nlk; + int err = 0; + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + if (protocol < 0 || protocol >= MAX_LINKS) + return -EPROTONOSUPPORT; + + netlink_lock_table(); +#ifdef CONFIG_MODULES + if (!nl_table[protocol].registered) { + netlink_unlock_table(); + request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); + netlink_lock_table(); + } +#endif + if (nl_table[protocol].registered && + 
try_module_get(nl_table[protocol].module)) + module = nl_table[protocol].module; + cb_mutex = nl_table[protocol].cb_mutex; + netlink_unlock_table(); + + err = __netlink_create(net, sock, cb_mutex, protocol); + if (err < 0) + goto out_module; + + local_bh_disable(); + sock_prot_inuse_add(net, &netlink_proto, 1); + local_bh_enable(); + + nlk = nlk_sk(sock->sk); + nlk->module = module; +out: + return err; + +out_module: + module_put(module); + goto out; +} + +static int netlink_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk; + + if (!sk) + return 0; + + netlink_remove(sk); + sock_orphan(sk); + nlk = nlk_sk(sk); + + /* + * OK. Socket is unlinked, any packets that arrive now + * will be purged. + */ + + sock->sk = NULL; + wake_up_interruptible_all(&nlk->wait); + + skb_queue_purge(&sk->sk_write_queue); + + if (nlk->pid && !nlk->subscriptions) { + struct netlink_notify n = { + .net = sock_net(sk), + .protocol = sk->sk_protocol, + .pid = nlk->pid, + }; + atomic_notifier_call_chain(&netlink_chain, + NETLINK_URELEASE, &n); + } + + module_put(nlk->module); + + netlink_table_grab(); + if (netlink_is_kernel(sk)) { + BUG_ON(nl_table[sk->sk_protocol].registered == 0); + if (--nl_table[sk->sk_protocol].registered == 0) { + kfree(nl_table[sk->sk_protocol].listeners); + nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].registered = 0; + } + } else if (nlk->subscriptions) + netlink_update_listeners(sk); + netlink_table_ungrab(); + + kfree(nlk->groups); + nlk->groups = NULL; + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); + local_bh_enable(); + sock_put(sk); + return 0; +} + +static int netlink_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; + struct sock *osk; + struct hlist_node *node; + s32 pid = current->tgid; + int err; + static s32 rover = -4097; + +retry: + cond_resched(); + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + sk_for_each(osk, node, head) { + if (!net_eq(sock_net(osk), net)) + continue; + if (nlk_sk(osk)->pid == pid) { + /* Bind collision, search negative pid values. */ + pid = rover--; + if (rover > -4097) + rover = -4097; + netlink_table_ungrab(); + goto retry; + } + } + netlink_table_ungrab(); + + err = netlink_insert(sk, net, pid); + if (err == -EADDRINUSE) + goto retry; + + /* If 2 threads race to autobind, that is fine. 
*/ + if (err == -EBUSY) + err = 0; + + return err; +} + +static inline int netlink_capable(struct socket *sock, unsigned int flag) +{ + return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || + capable(CAP_NET_ADMIN); +} + +static void +netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->subscriptions && !subscriptions) + __sk_del_bind_node(sk); + else if (!nlk->subscriptions && subscriptions) + sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); + nlk->subscriptions = subscriptions; +} + +static int netlink_realloc_groups(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int groups; + unsigned long *new_groups; + int err = 0; + + netlink_table_grab(); + + groups = nl_table[sk->sk_protocol].groups; + if (!nl_table[sk->sk_protocol].registered) { + err = -ENOENT; + goto out_unlock; + } + + if (nlk->ngroups >= groups) + goto out_unlock; + + new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); + if (new_groups == NULL) { + err = -ENOMEM; + goto out_unlock; + } + memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, + NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); + + nlk->groups = new_groups; + nlk->ngroups = groups; + out_unlock: + netlink_table_ungrab(); + return err; +} + +static int netlink_bind(struct socket *sock, struct sockaddr *addr, + int addr_len) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; + int err; + + if (nladdr->nl_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to listen multicasts */ + if (nladdr->nl_groups) { + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + err = netlink_realloc_groups(sk); + if (err) + return err; + } + + if (nlk->pid) { + if (nladdr->nl_pid != nlk->pid) + return -EINVAL; + } else { + err = nladdr->nl_pid ? 
+ netlink_insert(sk, net, nladdr->nl_pid) : + netlink_autobind(sock); + if (err) + return err; + } + + if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + return 0; + + netlink_table_grab(); + netlink_update_subscriptions(sk, nlk->subscriptions + + hweight32(nladdr->nl_groups) - + hweight32(nlk->groups[0])); + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + netlink_update_listeners(sk); + netlink_table_ungrab(); + + return 0; +} + +static int netlink_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + int err = 0; + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; + + if (addr->sa_family == AF_UNSPEC) { + sk->sk_state = NETLINK_UNCONNECTED; + nlk->dst_pid = 0; + nlk->dst_group = 0; + return 0; + } + if (addr->sa_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to send multicasts */ + if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + return -EPERM; + + if (!nlk->pid) + err = netlink_autobind(sock); + + if (err == 0) { + sk->sk_state = NETLINK_CONNECTED; + nlk->dst_pid = nladdr->nl_pid; + nlk->dst_group = ffs(nladdr->nl_groups); + } + + return err; +} + +static int netlink_getname(struct socket *sock, struct sockaddr *addr, + int *addr_len, int peer) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; + + nladdr->nl_family = AF_NETLINK; + nladdr->nl_pad = 0; + *addr_len = sizeof(*nladdr); + + if (peer) { + nladdr->nl_pid = nlk->dst_pid; + nladdr->nl_groups = netlink_group_mask(nlk->dst_group); + } else { + nladdr->nl_pid = nlk->pid; + nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; + } + return 0; +} + +static void netlink_overrun(struct sock *sk) +{ + if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } +} + +static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) +{ + struct sock *sock; + struct netlink_sock *nlk; + + sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, pid); + if (!sock) + return ERR_PTR(-ECONNREFUSED); + + /* Don't bother queuing skb if kernel socket has no input function */ + nlk = nlk_sk(sock); + if (sock->sk_state == NETLINK_CONNECTED && + nlk->dst_pid != nlk_sk(ssk)->pid) { + sock_put(sock); + return ERR_PTR(-ECONNREFUSED); + } + return sock; +} + +struct sock *netlink_getsockbyfilp(struct file *filp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct sock *sock; + + if (!S_ISSOCK(inode->i_mode)) + return ERR_PTR(-ENOTSOCK); + + sock = SOCKET_I(inode)->sk; + if (sock->sk_family != AF_NETLINK) + return ERR_PTR(-EINVAL); + + sock_hold(sock); + return sock; +} + +/* + * Attach a skb to a netlink socket. + * The caller must hold a reference to the destination socket. On error, the + * reference is dropped. The skb is not send to the destination, just all + * all error checks are performed and memory in the queue is reserved. + * Return values: + * < 0: error. skb freed, reference to sock dropped. + * 0: continue + * 1: repeat lookup - reference dropped while waiting for socket memory. 
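+ *    On a 1 return the caller is expected to look the destination socket up again and retry the attach; netlink_unicast() below does exactly that.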
+ */ +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, + long *timeo, struct sock *ssk) +{ + struct netlink_sock *nlk; + + nlk = nlk_sk(sk); + + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(0, &nlk->state)) { + DECLARE_WAITQUEUE(wait, current); + if (!*timeo) { + if (!ssk || netlink_is_kernel(ssk)) + netlink_overrun(sk); + sock_put(sk); + kfree_skb(skb); + return -EAGAIN; + } + + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&nlk->wait, &wait); + + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(0, &nlk->state)) && + !sock_flag(sk, SOCK_DEAD)) + *timeo = schedule_timeout(*timeo); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nlk->wait, &wait); + sock_put(sk); + + if (signal_pending(current)) { + kfree_skb(skb); + return sock_intr_errno(*timeo); + } + return 1; + } + skb_set_owner_r(skb, sk); + return 0; +} + +int netlink_sendskb(struct sock *sk, struct sk_buff *skb) +{ + int len = skb->len; + + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, len); + sock_put(sk); + return len; +} + +void netlink_detachskb(struct sock *sk, struct sk_buff *skb) +{ + kfree_skb(skb); + sock_put(sk); +} + +static inline struct sk_buff *netlink_trim(struct sk_buff *skb, + gfp_t allocation) +{ + int delta; + + skb_orphan(skb); + + delta = skb->end - skb->tail; + if (delta * 2 < skb->truesize) + return skb; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, allocation); + if (!nskb) + return skb; + kfree_skb(skb); + skb = nskb; + } + + if (!pskb_expand_head(skb, 0, -delta, allocation)) + skb->truesize -= delta; + + return skb; +} + +static inline void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(0, &nlk->state); + if (!test_bit(0, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb) +{ + int ret; + struct netlink_sock *nlk = nlk_sk(sk); + + ret = -ECONNREFUSED; + if (nlk->netlink_rcv != NULL) { + ret = skb->len; + skb_set_owner_r(skb, sk); + nlk->netlink_rcv(skb); + } + kfree_skb(skb); + sock_put(sk); + return ret; +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, + u32 pid, int nonblock) +{ + struct sock *sk; + int err; + long timeo; + + skb = netlink_trim(skb, gfp_any()); + + timeo = sock_sndtimeo(ssk, nonblock); +retry: + sk = netlink_getsockbypid(ssk, pid); + if (IS_ERR(sk)) { + kfree_skb(skb); + return PTR_ERR(sk); + } + if (netlink_is_kernel(sk)) + return netlink_unicast_kernel(sk, skb); + + if (sk_filter(sk, skb)) { + err = skb->len; + kfree_skb(skb); + sock_put(sk); + return err; + } + + err = netlink_attachskb(sk, skb, &timeo, ssk); + if (err == 1) + goto retry; + if (err) + return err; + + return netlink_sendskb(sk, skb); +} +EXPORT_SYMBOL(netlink_unicast); + +int netlink_has_listeners(struct sock *sk, unsigned int group) +{ + int res = 0; + unsigned long *listeners; + + BUG_ON(!netlink_is_kernel(sk)); + + rcu_read_lock(); + listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); + + if (group - 1 < nl_table[sk->sk_protocol].groups) + res = test_bit(group - 1, listeners); + + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL_GPL(netlink_has_listeners); + +static inline int netlink_broadcast_deliver(struct sock *sk, + struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !test_bit(0, &nlk->state)) { + skb_set_owner_r(skb, 
sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; + } + return -1; +} + +struct netlink_broadcast_data { + struct sock *exclude_sk; + struct net *net; + u32 pid; + u32 group; + int failure; + int congested; + int delivered; + gfp_t allocation; + struct sk_buff *skb, *skb2; +}; + +static inline int do_one_broadcast(struct sock *sk, + struct netlink_broadcast_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int val; + + if (p->exclude_sk == sk) + goto out; + + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) + goto out; + + if (!net_eq(sock_net(sk), p->net)) + goto out; + + if (p->failure) { + netlink_overrun(sk); + goto out; + } + + sock_hold(sk); + if (p->skb2 == NULL) { + if (skb_shared(p->skb)) { + p->skb2 = skb_clone(p->skb, p->allocation); + } else { + p->skb2 = skb_get(p->skb); + /* + * skb ownership may have been set when + * delivered to a previous socket. + */ + skb_orphan(p->skb2); + } + } + if (p->skb2 == NULL) { + netlink_overrun(sk); + /* Clone failed. Notify ALL listeners. */ + p->failure = 1; + } else if (sk_filter(sk, p->skb2)) { + kfree_skb(p->skb2); + p->skb2 = NULL; + } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + netlink_overrun(sk); + } else { + p->congested |= val; + p->delivered = 1; + p->skb2 = NULL; + } + sock_put(sk); + +out: + return 0; +} + +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, + u32 group, gfp_t allocation) +{ + struct net *net = sock_net(ssk); + struct netlink_broadcast_data info; + struct hlist_node *node; + struct sock *sk; + + skb = netlink_trim(skb, allocation); + + info.exclude_sk = ssk; + info.net = net; + info.pid = pid; + info.group = group; + info.failure = 0; + info.congested = 0; + info.delivered = 0; + info.allocation = allocation; + info.skb = skb; + info.skb2 = NULL; + + /* While we sleep in clone, do not allow to change socket list */ + + netlink_lock_table(); + + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + do_one_broadcast(sk, &info); + + kfree_skb(skb); + + netlink_unlock_table(); + + if (info.skb2) + kfree_skb(info.skb2); + + if (info.delivered) { + if (info.congested && (allocation & __GFP_WAIT)) + yield(); + return 0; + } + if (info.failure) + return -ENOBUFS; + return -ESRCH; +} +EXPORT_SYMBOL(netlink_broadcast); + +struct netlink_set_err_data { + struct sock *exclude_sk; + u32 pid; + u32 group; + int code; +}; + +static inline int do_one_set_err(struct sock *sk, + struct netlink_set_err_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (sk == p->exclude_sk) + goto out; + + if (sock_net(sk) != sock_net(p->exclude_sk)) + goto out; + + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) + goto out; + + sk->sk_err = p->code; + sk->sk_error_report(sk); +out: + return 0; +} + +/** + * netlink_set_err - report error to broadcast listeners + * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() + * @pid: the PID of a process that we want to skip (if any) + * @groups: the broadcast group that will notice the error + * @code: error code, must be negative (as usual in kernelspace) + */ +void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) +{ + struct netlink_set_err_data info; + struct hlist_node *node; + struct sock *sk; + + info.exclude_sk = ssk; + info.pid = pid; + info.group = group; + /* sk->sk_err wants a positive error value */ + 
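	/* The (positive) error is delivered below to every socket on this protocol that is subscribed to the group, skipping the sending kernel socket itself. */ +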
info.code = -code; + + read_lock(&nl_table_lock); + + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + do_one_set_err(sk, &info); + + read_unlock(&nl_table_lock); +} + +/* must be called with netlink table grabbed */ +static void netlink_update_socket_mc(struct netlink_sock *nlk, + unsigned int group, + int is_new) +{ + int old, new = !!is_new, subscriptions; + + old = test_bit(group - 1, nlk->groups); + subscriptions = nlk->subscriptions - old + new; + if (new) + __set_bit(group - 1, nlk->groups); + else + __clear_bit(group - 1, nlk->groups); + netlink_update_subscriptions(&nlk->sk, subscriptions); + netlink_update_listeners(&nlk->sk); +} + +static int netlink_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int val = 0; + int err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (optlen >= sizeof(int) && + get_user(val, (unsigned int __user *)optval)) + return -EFAULT; + + switch (optname) { + case NETLINK_PKTINFO: + if (val) + nlk->flags |= NETLINK_RECV_PKTINFO; + else + nlk->flags &= ~NETLINK_RECV_PKTINFO; + err = 0; + break; + case NETLINK_ADD_MEMBERSHIP: + case NETLINK_DROP_MEMBERSHIP: { + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + err = netlink_realloc_groups(sk); + if (err) + return err; + if (!val || val - 1 >= nlk->ngroups) + return -EINVAL; + netlink_table_grab(); + netlink_update_socket_mc(nlk, val, + optname == NETLINK_ADD_MEMBERSHIP); + netlink_table_ungrab(); + err = 0; + break; + } + default: + err = -ENOPROTOOPT; + } + return err; +} + +static int netlink_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int len, val, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case NETLINK_PKTINFO: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_PKTINFO ? 
1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + default: + err = -ENOPROTOOPT; + } + return err; +} + +static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct nl_pktinfo info; + + info.group = NETLINK_CB(skb).dst_group; + put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); +} + +static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *addr = msg->msg_name; + u32 dst_pid; + u32 dst_group; + struct sk_buff *skb; + int err; + struct scm_cookie scm; + + if (msg->msg_flags&MSG_OOB) + return -EOPNOTSUPP; + + if (NULL == siocb->scm) + siocb->scm = &scm; + err = scm_send(sock, msg, siocb->scm); + if (err < 0) + return err; + + if (msg->msg_namelen) { + if (addr->nl_family != AF_NETLINK) + return -EINVAL; + dst_pid = addr->nl_pid; + dst_group = ffs(addr->nl_groups); + if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) + return -EPERM; + } else { + dst_pid = nlk->dst_pid; + dst_group = nlk->dst_group; + } + + if (!nlk->pid) { + err = netlink_autobind(sock); + if (err) + goto out; + } + + err = -EMSGSIZE; + if (len > sk->sk_sndbuf - 32) + goto out; + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); + if (skb == NULL) + goto out; + + NETLINK_CB(skb).pid = nlk->pid; + NETLINK_CB(skb).dst_group = dst_group; + NETLINK_CB(skb).loginuid = audit_get_loginuid(current); + NETLINK_CB(skb).sessionid = audit_get_sessionid(current); + security_task_getsecid(current, &(NETLINK_CB(skb).sid)); + memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + + /* What can I do? Netlink is asynchronous, so that + we will have to save current capabilities to + check them, when this message will be delivered + to corresponding kernel module. 
--ANK (980802) + */ + + err = -EFAULT; + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + kfree_skb(skb); + goto out; + } + + err = security_netlink_send(sk, skb); + if (err) { + kfree_skb(skb); + goto out; + } + + if (dst_group) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); + +out: + return err; +} + +static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len, + int flags) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct scm_cookie scm; + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int noblock = flags&MSG_DONTWAIT; + size_t copied; + struct sk_buff *skb; + int err; + + if (flags&MSG_OOB) + return -EOPNOTSUPP; + + copied = 0; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (skb == NULL) + goto out; + + msg->msg_namelen = 0; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb_reset_transport_header(skb); + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (msg->msg_name) { + struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; + addr->nl_family = AF_NETLINK; + addr->nl_pad = 0; + addr->nl_pid = NETLINK_CB(skb).pid; + addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); + msg->msg_namelen = sizeof(*addr); + } + + if (nlk->flags & NETLINK_RECV_PKTINFO) + netlink_cmsg_recv_pktinfo(msg, skb); + + if (NULL == siocb->scm) { + memset(&scm, 0, sizeof(scm)); + siocb->scm = &scm; + } + siocb->scm->creds = *NETLINK_CREDS(skb); + if (flags & MSG_TRUNC) + copied = skb->len; + skb_free_datagram(sk, skb); + + if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) + netlink_dump(sk); + + scm_recv(sock, msg, siocb->scm, flags); +out: + netlink_rcv_wake(sk); + return err ? : copied; +} + +static void netlink_data_ready(struct sock *sk, int len) +{ + BUG(); +} + +/* + * We export these functions to other modules. They provide a + * complete set of kernel non-blocking support for message + * queueing. + */ + +struct sock * +netlink_kernel_create(struct net *net, int unit, unsigned int groups, + void (*input)(struct sk_buff *skb), + struct mutex *cb_mutex, struct module *module) +{ + struct socket *sock; + struct sock *sk; + struct netlink_sock *nlk; + unsigned long *listeners = NULL; + + BUG_ON(!nl_table); + + if (unit < 0 || unit >= MAX_LINKS) + return NULL; + + if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) + return NULL; + + /* + * We have to just have a reference on the net from sk, but don't + * get_net it. Besides, we cannot get and then put the net here. + * So we create one inside init_net and the move it to net. 
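+ * (The sk_change_net() call below performs that move.)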
+ */ + + if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + goto out_sock_release_nosk; + + sk = sock->sk; + sk_change_net(sk, net); + + if (groups < 32) + groups = 32; + + listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL); + if (!listeners) + goto out_sock_release; + + sk->sk_data_ready = netlink_data_ready; + if (input) + nlk_sk(sk)->netlink_rcv = input; + + if (netlink_insert(sk, net, 0)) + goto out_sock_release; + + nlk = nlk_sk(sk); + nlk->flags |= NETLINK_KERNEL_SOCKET; + + netlink_table_grab(); + if (!nl_table[unit].registered) { + nl_table[unit].groups = groups; + nl_table[unit].listeners = listeners; + nl_table[unit].cb_mutex = cb_mutex; + nl_table[unit].module = module; + nl_table[unit].registered = 1; + } else { + kfree(listeners); + nl_table[unit].registered++; + } + netlink_table_ungrab(); + return sk; + +out_sock_release: + kfree(listeners); + netlink_kernel_release(sk); + return NULL; + +out_sock_release_nosk: + sock_release(sock); + return NULL; +} +EXPORT_SYMBOL(netlink_kernel_create); + + +void +netlink_kernel_release(struct sock *sk) +{ + sk_release_kernel(sk); +} +EXPORT_SYMBOL(netlink_kernel_release); + + +/** + * netlink_change_ngroups - change number of multicast groups + * + * This changes the number of multicast groups that are available + * on a certain netlink family. Note that it is not possible to + * change the number of groups to below 32. Also note that it does + * not implicitly call netlink_clear_multicast_users() when the + * number of groups is reduced. + * + * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). + * @groups: The new number of groups. + */ +int netlink_change_ngroups(struct sock *sk, unsigned int groups) +{ + unsigned long *listeners, *old = NULL; + struct netlink_table *tbl = &nl_table[sk->sk_protocol]; + int err = 0; + + if (groups < 32) + groups = 32; + + netlink_table_grab(); + if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { + listeners = kzalloc(NLGRPSZ(groups), GFP_ATOMIC); + if (!listeners) { + err = -ENOMEM; + goto out_ungrab; + } + old = tbl->listeners; + memcpy(listeners, old, NLGRPSZ(tbl->groups)); + rcu_assign_pointer(tbl->listeners, listeners); + } + tbl->groups = groups; + + out_ungrab: + netlink_table_ungrab(); + synchronize_rcu(); + kfree(old); + return err; +} +EXPORT_SYMBOL(netlink_change_ngroups); + +/** + * netlink_clear_multicast_users - kick off multicast listeners + * + * This function removes all listeners from the given group. + * @ksk: The kernel netlink socket, as returned by + * netlink_kernel_create(). + * @group: The multicast group to clear. + */ +void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) +{ + struct sock *sk; + struct hlist_node *node; + struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; + + netlink_table_grab(); + + sk_for_each_bound(sk, node, &tbl->mc_list) + netlink_update_socket_mc(nlk_sk(sk), group, 0); + + netlink_table_ungrab(); +} +EXPORT_SYMBOL(netlink_clear_multicast_users); + +void netlink_set_nonroot(int protocol, unsigned int flags) +{ + if ((unsigned int)protocol < MAX_LINKS) + nl_table[protocol].nl_nonroot = flags; +} +EXPORT_SYMBOL(netlink_set_nonroot); + +static void netlink_destroy_callback(struct netlink_callback *cb) +{ + if (cb->skb) + kfree_skb(cb->skb); + kfree(cb); +} + +/* + * It looks a bit ugly. + * It would be better to create kernel thread. 
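+ * Each pass fills at most one skb from cb->dump(); netlink_recvmsg() keeps re-invoking netlink_dump() until the terminating NLMSG_DONE message has been queued.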
+ */ + +static int netlink_dump(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_callback *cb; + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len, err = -ENOBUFS; + + skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); + if (!skb) + goto errout; + + mutex_lock(nlk->cb_mutex); + + cb = nlk->cb; + if (cb == NULL) { + err = -EINVAL; + goto errout_skb; + } + + len = cb->dump(skb, cb); + + if (len > 0) { + mutex_unlock(nlk->cb_mutex); + + if (sk_filter(sk, skb)) + kfree_skb(skb); + else { + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + } + return 0; + } + + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI); + if (!nlh) + goto errout_skb; + + memcpy(nlmsg_data(nlh), &len, sizeof(len)); + + if (sk_filter(sk, skb)) + kfree_skb(skb); + else { + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + } + + if (cb->done) + cb->done(cb); + nlk->cb = NULL; + mutex_unlock(nlk->cb_mutex); + + netlink_destroy_callback(cb); + return 0; + +errout_skb: + mutex_unlock(nlk->cb_mutex); + kfree_skb(skb); +errout: + return err; +} + +int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + struct nlmsghdr *nlh, + int (*dump)(struct sk_buff *skb, + struct netlink_callback *), + int (*done)(struct netlink_callback *)) +{ +#ifdef DDE_LINUX + return -ENOBUFS; +#else + struct netlink_callback *cb; + struct sock *sk; + struct netlink_sock *nlk; + + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + if (cb == NULL) + return -ENOBUFS; + + cb->dump = dump; + cb->done = done; + cb->nlh = nlh; + atomic_inc(&skb->users); + cb->skb = skb; + + sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).pid); + if (sk == NULL) { + netlink_destroy_callback(cb); + return -ECONNREFUSED; + } + nlk = nlk_sk(sk); + /* A dump is in progress... */ + mutex_lock(nlk->cb_mutex); + if (nlk->cb) { + mutex_unlock(nlk->cb_mutex); + netlink_destroy_callback(cb); + sock_put(sk); + return -EBUSY; + } + nlk->cb = cb; + mutex_unlock(nlk->cb_mutex); + + netlink_dump(sk); + sock_put(sk); + + /* We successfully started a dump, by returning -EINTR we + * signal not to send ACK even if it was requested. + */ + return -EINTR; +#endif +} +EXPORT_SYMBOL(netlink_dump_start); + +void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) +{ + struct sk_buff *skb; + struct nlmsghdr *rep; + struct nlmsgerr *errmsg; + size_t payload = sizeof(*errmsg); + + /* error messages get the original request appened */ + if (err) + payload += nlmsg_len(nlh); + + skb = nlmsg_new(payload, GFP_KERNEL); + if (!skb) { + struct sock *sk; + + sk = netlink_lookup(sock_net(in_skb->sk), + in_skb->sk->sk_protocol, + NETLINK_CB(in_skb).pid); + if (sk) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + sock_put(sk); + } + return; + } + + rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + NLMSG_ERROR, sizeof(struct nlmsgerr), 0); + errmsg = nlmsg_data(rep); + errmsg->error = err; + memcpy(&errmsg->msg, nlh, err ? 
nlh->nlmsg_len : sizeof(*nlh)); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); +} +EXPORT_SYMBOL(netlink_ack); + +int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *)) +{ + struct nlmsghdr *nlh; + int err; + + while (skb->len >= nlmsg_total_size(0)) { + int msglen; + + nlh = nlmsg_hdr(skb); + err = 0; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) + return 0; + + /* Only requests are handled by the kernel */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) + goto ack; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + + err = cb(skb, nlh); + if (err == -EINTR) + goto skip; + +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) + netlink_ack(skb, nlh, err); + +skip: + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); + } + + return 0; +} +EXPORT_SYMBOL(netlink_rcv_skb); + +/** + * nlmsg_notify - send a notification netlink message + * @sk: netlink socket to use + * @skb: notification message + * @pid: destination netlink pid for reports or 0 + * @group: destination multicast group or 0 + * @report: 1 to report back, 0 to disable + * @flags: allocation flags + */ +int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, + unsigned int group, int report, gfp_t flags) +{ + int err = 0; + + if (group) { + int exclude_pid = 0; + + if (report) { + atomic_inc(&skb->users); + exclude_pid = pid; + } + + /* errors reported via destination sk->sk_err */ + nlmsg_multicast(sk, skb, exclude_pid, group, flags); + } + + if (report) + err = nlmsg_unicast(sk, skb, pid); + + return err; +} +EXPORT_SYMBOL(nlmsg_notify); + +#ifdef CONFIG_PROC_FS +struct nl_seq_iter { + struct seq_net_private p; + int link; + int hash_idx; +}; + +static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) +{ + struct nl_seq_iter *iter = seq->private; + int i, j; + struct sock *s; + struct hlist_node *node; + loff_t off = 0; + + for (i = 0; i < MAX_LINKS; i++) { + struct nl_pid_hash *hash = &nl_table[i].hash; + + for (j = 0; j <= hash->mask; j++) { + sk_for_each(s, node, &hash->table[j]) { + if (sock_net(s) != seq_file_net(seq)) + continue; + if (off == pos) { + iter->link = i; + iter->hash_idx = j; + return s; + } + ++off; + } + } + } + return NULL; +} + +static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(nl_table_lock) +{ + read_lock(&nl_table_lock); + return *pos ? 
netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *s; + struct nl_seq_iter *iter; + int i, j; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return netlink_seq_socket_idx(seq, 0); + + iter = seq->private; + s = v; + do { + s = sk_next(s); + } while (s && sock_net(s) != seq_file_net(seq)); + if (s) + return s; + + i = iter->link; + j = iter->hash_idx + 1; + + do { + struct nl_pid_hash *hash = &nl_table[i].hash; + + for (; j <= hash->mask; j++) { + s = sk_head(&hash->table[j]); + while (s && sock_net(s) != seq_file_net(seq)) + s = sk_next(s); + if (s) { + iter->link = i; + iter->hash_idx = j; + return s; + } + } + + j = 0; + } while (++i < MAX_LINKS); + + return NULL; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) + __releases(nl_table_lock) +{ + read_unlock(&nl_table_lock); +} + + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "sk Eth Pid Groups " + "Rmem Wmem Dump Locks\n"); + else { + struct sock *s = v; + struct netlink_sock *nlk = nlk_sk(s); + + seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n", + s, + s->sk_protocol, + nlk->pid, + nlk->groups ? (u32)nlk->groups[0] : 0, + atomic_read(&s->sk_rmem_alloc), + atomic_read(&s->sk_wmem_alloc), + nlk->cb, + atomic_read(&s->sk_refcnt) + ); + + } + return 0; +} + +static const struct seq_operations netlink_seq_ops = { + .start = netlink_seq_start, + .next = netlink_seq_next, + .stop = netlink_seq_stop, + .show = netlink_seq_show, +}; + + +static int netlink_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &netlink_seq_ops, + sizeof(struct nl_seq_iter)); +} + +static const struct file_operations netlink_seq_fops = { + .owner = THIS_MODULE, + .open = netlink_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +int netlink_register_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&netlink_chain, nb); +} +EXPORT_SYMBOL(netlink_register_notifier); + +int netlink_unregister_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&netlink_chain, nb); +} +EXPORT_SYMBOL(netlink_unregister_notifier); + +static const struct proto_ops netlink_ops = { + .family = PF_NETLINK, + .owner = THIS_MODULE, + .release = netlink_release, + .bind = netlink_bind, + .connect = netlink_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = netlink_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = netlink_setsockopt, + .getsockopt = netlink_getsockopt, + .sendmsg = netlink_sendmsg, + .recvmsg = netlink_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct net_proto_family netlink_family_ops = { + .family = PF_NETLINK, + .create = netlink_create, + .owner = THIS_MODULE, /* for consistency 8) */ +}; + +static int __net_init netlink_net_init(struct net *net) +{ +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit netlink_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "netlink"); +#endif +} + +static struct pernet_operations __net_initdata netlink_net_ops = { + .init = netlink_net_init, + .exit = netlink_net_exit, +}; + +static int __init netlink_proto_init(void) +{ + struct sk_buff 
*dummy_skb; + int i; + unsigned long limit; + unsigned int order; + int err = proto_register(&netlink_proto, 0); + + if (err != 0) + goto out; + + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)); + + nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); + if (!nl_table) + goto panic; + + if (num_physpages >= (128 * 1024)) + limit = num_physpages >> (21 - PAGE_SHIFT); + else + limit = num_physpages >> (23 - PAGE_SHIFT); + + order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; + limit = (1UL << order) / sizeof(struct hlist_head); + order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; + + for (i = 0; i < MAX_LINKS; i++) { + struct nl_pid_hash *hash = &nl_table[i].hash; + + hash->table = nl_pid_hash_zalloc(1 * sizeof(*hash->table)); + if (!hash->table) { + while (i-- > 0) + nl_pid_hash_free(nl_table[i].hash.table, + 1 * sizeof(*hash->table)); + kfree(nl_table); + goto panic; + } + hash->max_shift = order; + hash->shift = 0; + hash->mask = 0; + hash->rehash_time = jiffies; + } + + sock_register(&netlink_family_ops); + register_pernet_subsys(&netlink_net_ops); + /* The netlink device handler may be needed early. */ + rtnetlink_init(); +out: + return err; +panic: + panic("netlink_init: Cannot allocate nl_table\n"); +} + +core_initcall(netlink_proto_init); diff --git a/libdde-linux26/lib/src/net/sched/sch_generic.c b/libdde-linux26/lib/src/net/sched/sch_generic.c new file mode 100644 index 00000000..a2acd6c4 --- /dev/null +++ b/libdde-linux26/lib/src/net/sched/sch_generic.c @@ -0,0 +1,749 @@ +/* + * net/sched/sch_generic.c Generic packet scheduler routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 + * - Ingress support + */ + +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/list.h> +#include <net/pkt_sched.h> + +#ifdef DDE_LINUX +#include "local.h" +#endif + +/* Main transmission queue. */ + +/* Modifications to data participating in scheduling must be protected with + * qdisc_lock(qdisc) spinlock. + * + * The idea is the following: + * - enqueue, dequeue are serialized via qdisc root lock + * - ingress filtering is also serialized via qdisc root lock + * - updates to tree and tree walking are only done under the rtnl mutex. 
+ */ + +static inline int qdisc_qlen(struct Qdisc *q) +{ + return q->q.qlen; +} + +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + q->gso_skb = skb; + q->qstats.requeues++; + __netif_schedule(q); + + return 0; +} + +static inline struct sk_buff *dequeue_skb(struct Qdisc *q) +{ + struct sk_buff *skb = q->gso_skb; + + if (unlikely(skb)) { + struct net_device *dev = qdisc_dev(q); + struct netdev_queue *txq; + + /* check the reason of requeuing without tx lock first */ + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) + q->gso_skb = NULL; + else + skb = NULL; + } else { + skb = q->dequeue(q); + } + + return skb; +} + +static inline int handle_dev_cpu_collision(struct sk_buff *skb, + struct netdev_queue *dev_queue, + struct Qdisc *q) +{ + int ret; + + if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { + /* + * Same CPU holding the lock. It may be a transient + * configuration error, when hard_start_xmit() recurses. We + * detect it by checking xmit owner and drop the packet when + * deadloop is detected. Return OK to try the next skb. + */ + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_WARNING "Dead loop on netdevice %s, " + "fix it urgently!\n", dev_queue->dev->name); + ret = qdisc_qlen(q); + } else { + /* + * Another cpu is holding lock, requeue & delay xmits for + * some time. + */ + __get_cpu_var(netdev_rx_stat).cpu_collision++; + ret = dev_requeue_skb(skb, q); + } + + return ret; +} + +/* + * NOTE: Called under qdisc_lock(q) with locally disabled BH. + * + * __QDISC_STATE_RUNNING guarantees only one CPU can process + * this qdisc at a time. qdisc_lock(q) serializes queue accesses for + * this queue. + * + * netif_tx_lock serializes accesses to device driver. + * + * qdisc_lock(q) and netif_tx_lock are mutually exclusive, + * if one is grabbed, another must be free. + * + * Note, that this procedure can be called by a watchdog timer + * + * Returns to the caller: + * 0 - queue is empty or throttled. + * >0 - queue is not empty. + * + */ +static inline int qdisc_restart(struct Qdisc *q) +{ + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + struct net_device *dev; + spinlock_t *root_lock; + struct sk_buff *skb; + + /* Dequeue packet */ + if (unlikely((skb = dequeue_skb(q)) == NULL)) + return 0; + + root_lock = qdisc_lock(q); + + /* And release qdisc */ + spin_unlock(root_lock); + + dev = qdisc_dev(q); + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_tx_queue_stopped(txq) && + !netif_tx_queue_frozen(txq)) + ret = dev_hard_start_xmit(skb, dev, txq); + HARD_TX_UNLOCK(dev, txq); + + spin_lock(root_lock); + + switch (ret) { + case NETDEV_TX_OK: + /* Driver sent out skb successfully */ + ret = qdisc_qlen(q); + break; + + case NETDEV_TX_LOCKED: + /* Driver try lock failed */ + ret = handle_dev_cpu_collision(skb, txq, q); + break; + + default: + /* Driver returned NETDEV_TX_BUSY - requeue skb */ + if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) + printk(KERN_WARNING "BUG %s code %d qlen %d\n", + dev->name, ret, q->q.qlen); + + ret = dev_requeue_skb(skb, q); + break; + } + + if (ret && (netif_tx_queue_stopped(txq) || + netif_tx_queue_frozen(txq))) + ret = 0; + + return ret; +} + +void __qdisc_run(struct Qdisc *q) +{ + unsigned long start_time = jiffies; + + while (qdisc_restart(q)) { + /* + * Postpone processing if + * 1. another process needs the CPU; + * 2. 
we've been doing it for too long. + */ + if (need_resched() || jiffies != start_time) { + __netif_schedule(q); + break; + } + } + + clear_bit(__QDISC_STATE_RUNNING, &q->state); +} + +static void dev_watchdog(unsigned long arg) +{ + struct net_device *dev = (struct net_device *)arg; + + netif_tx_lock(dev); + if (!qdisc_tx_is_noop(dev)) { + if (netif_device_present(dev) && + netif_running(dev) && + netif_carrier_ok(dev)) { + int some_queue_stopped = 0; + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, i); + if (netif_tx_queue_stopped(txq)) { + some_queue_stopped = 1; + break; + } + } + + if (some_queue_stopped && + time_after(jiffies, (dev->trans_start + + dev->watchdog_timeo))) { + char drivername[64]; + WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n", + dev->name, netdev_drivername(dev, drivername, 64)); + dev->netdev_ops->ndo_tx_timeout(dev); + } + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + + dev->watchdog_timeo))) + dev_hold(dev); + } + } + netif_tx_unlock(dev); + + dev_put(dev); +} + +void __netdev_watchdog_up(struct net_device *dev) +{ + if (dev->netdev_ops->ndo_tx_timeout) { + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + dev->watchdog_timeo))) + dev_hold(dev); + } +} + +static void dev_watchdog_up(struct net_device *dev) +{ + __netdev_watchdog_up(dev); +} + +static void dev_watchdog_down(struct net_device *dev) +{ + netif_tx_lock_bh(dev); + if (del_timer(&dev->watchdog_timer)) + dev_put(dev); + netif_tx_unlock_bh(dev); +} + +/** + * netif_carrier_on - set carrier + * @dev: network device + * + * Device has detected that carrier. + */ +void netif_carrier_on(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + linkwatch_fire_event(dev); + if (netif_running(dev)) + __netdev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(netif_carrier_on); + +/** + * netif_carrier_off - clear carrier + * @dev: network device + * + * Device has detected loss of carrier. + */ +void netif_carrier_off(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + linkwatch_fire_event(dev); + } +} +EXPORT_SYMBOL(netif_carrier_off); + +/* "NOOP" scheduler: the best scheduler, recommended for all interfaces + under all circumstances. It is difficult to invent anything faster or + cheaper. 
+ */ + +static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +{ + kfree_skb(skb); + return NET_XMIT_CN; +} + +static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +{ + return NULL; +} + +struct Qdisc_ops noop_qdisc_ops __read_mostly = { + .id = "noop", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .peek = noop_dequeue, + .owner = THIS_MODULE, +}; + +static struct netdev_queue noop_netdev_queue = { + .qdisc = &noop_qdisc, + .qdisc_sleeping = &noop_qdisc, +}; + +struct Qdisc noop_qdisc = { + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noop_qdisc_ops, + .list = LIST_HEAD_INIT(noop_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, +}; +EXPORT_SYMBOL(noop_qdisc); + +static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { + .id = "noqueue", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .peek = noop_dequeue, + .owner = THIS_MODULE, +}; + +static struct Qdisc noqueue_qdisc; +static struct netdev_queue noqueue_netdev_queue = { + .qdisc = &noqueue_qdisc, + .qdisc_sleeping = &noqueue_qdisc, +}; + +static struct Qdisc noqueue_qdisc = { + .enqueue = NULL, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noqueue_qdisc_ops, + .list = LIST_HEAD_INIT(noqueue_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), + .dev_queue = &noqueue_netdev_queue, +}; + + +static const u8 prio2band[TC_PRIO_MAX+1] = + { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; + +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. + */ + +#define PFIFO_FAST_BANDS 3 + +static inline struct sk_buff_head *prio2list(struct sk_buff *skb, + struct Qdisc *qdisc) +{ + struct sk_buff_head *list = qdisc_priv(qdisc); + return list + prio2band[skb->priority & TC_PRIO_MAX]; +} + +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + struct sk_buff_head *list = prio2list(skb, qdisc); + + if (skb_queue_len(list) < qdisc_dev(qdisc)->tx_queue_len) { + qdisc->q.qlen++; + return __qdisc_enqueue_tail(skb, qdisc, list); + } + + return qdisc_drop(skb, qdisc); +} + +static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + if (!skb_queue_empty(list + prio)) { + qdisc->q.qlen--; + return __qdisc_dequeue_head(qdisc, list + prio); + } + } + + return NULL; +} + +static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + if (!skb_queue_empty(list + prio)) + return skb_peek(list + prio); + } + + return NULL; +} + +static void pfifo_fast_reset(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) + __qdisc_reset_queue(qdisc, list + prio); + + qdisc->qstats.backlog = 0; + qdisc->q.qlen = 0; +} + +static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) +{ + struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; + +#ifndef DDE_LINUX + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +#else + WARN_UNIMPL; +#endif + return skb->len; + +nla_put_failure: + return -1; +} + +static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio 
= 0; prio < PFIFO_FAST_BANDS; prio++) + skb_queue_head_init(list + prio); + + return 0; +} + +static struct Qdisc_ops pfifo_fast_ops __read_mostly = { + .id = "pfifo_fast", + .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head), + .enqueue = pfifo_fast_enqueue, + .dequeue = pfifo_fast_dequeue, + .peek = pfifo_fast_peek, + .init = pfifo_fast_init, + .reset = pfifo_fast_reset, + .dump = pfifo_fast_dump, + .owner = THIS_MODULE, +}; + +struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + struct Qdisc_ops *ops) +{ + void *p; + struct Qdisc *sch; + unsigned int size; + int err = -ENOBUFS; + + /* ensure that the Qdisc and the private data are 32-byte aligned */ + size = QDISC_ALIGN(sizeof(*sch)); + size += ops->priv_size + (QDISC_ALIGNTO - 1); + + p = kzalloc(size, GFP_KERNEL); + if (!p) + goto errout; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; + + INIT_LIST_HEAD(&sch->list); + skb_queue_head_init(&sch->q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev_queue = dev_queue; + dev_hold(qdisc_dev(sch)); + atomic_set(&sch->refcnt, 1); + + return sch; +errout: + return ERR_PTR(err); +} + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, + struct netdev_queue *dev_queue, + struct Qdisc_ops *ops, + unsigned int parentid) +{ + struct Qdisc *sch; + + sch = qdisc_alloc(dev_queue, ops); + if (IS_ERR(sch)) + goto errout; + sch->parent = parentid; + + if (!ops->init || ops->init(sch, NULL) == 0) + return sch; + + qdisc_destroy(sch); +errout: + return NULL; +} +EXPORT_SYMBOL(qdisc_create_dflt); + +/* Under qdisc_lock(qdisc) and BH! */ + +void qdisc_reset(struct Qdisc *qdisc) +{ + const struct Qdisc_ops *ops = qdisc->ops; + + if (ops->reset) + ops->reset(qdisc); + + kfree_skb(qdisc->gso_skb); + qdisc->gso_skb = NULL; +} +EXPORT_SYMBOL(qdisc_reset); + +void qdisc_destroy(struct Qdisc *qdisc) +{ + const struct Qdisc_ops *ops = qdisc->ops; + + if (qdisc->flags & TCQ_F_BUILTIN || + !atomic_dec_and_test(&qdisc->refcnt)) + return; + +#ifdef CONFIG_NET_SCHED +#ifndef DDE_LINUX + qdisc_list_del(qdisc); + + qdisc_put_stab(qdisc->stab); + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); +#endif +#endif + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + + module_put(ops->owner); + dev_put(qdisc_dev(qdisc)); + + kfree_skb(qdisc->gso_skb); + kfree((char *) qdisc - qdisc->padded); +} +EXPORT_SYMBOL(qdisc_destroy); + +static bool dev_all_qdisc_sleeping_noop(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + if (txq->qdisc_sleeping != &noop_qdisc) + return false; + } + return true; +} + +static void attach_one_default_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) +{ + struct Qdisc *qdisc; + + if (dev->tx_queue_len) { + qdisc = qdisc_create_dflt(dev, dev_queue, + &pfifo_fast_ops, TC_H_ROOT); + if (!qdisc) { + printk(KERN_INFO "%s: activation failed\n", dev->name); + return; + } + } else { + qdisc = &noqueue_qdisc; + } + dev_queue->qdisc_sleeping = qdisc; +} + +static void transition_one_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_need_watchdog) +{ + struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping; + int *need_watchdog_p = _need_watchdog; + + if (!(new_qdisc->flags & TCQ_F_BUILTIN)) + clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); + + rcu_assign_pointer(dev_queue->qdisc, new_qdisc); + if 
(need_watchdog_p && new_qdisc != &noqueue_qdisc) + *need_watchdog_p = 1; +} + +void dev_activate(struct net_device *dev) +{ + int need_watchdog; + + /* No queueing discipline is attached to device; + create default one i.e. pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual interfaces + */ + + if (dev_all_qdisc_sleeping_noop(dev)) + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); + + if (!netif_carrier_ok(dev)) + /* Delay activation until next carrier-on event */ + return; + + need_watchdog = 0; + netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); + transition_one_qdisc(dev, &dev->rx_queue, NULL); + + if (need_watchdog) { + dev->trans_start = jiffies; + dev_watchdog_up(dev); + } +} + +static void dev_deactivate_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) +{ + struct Qdisc *qdisc_default = _qdisc_default; + struct Qdisc *qdisc; + + qdisc = dev_queue->qdisc; + if (qdisc) { + spin_lock_bh(qdisc_lock(qdisc)); + + if (!(qdisc->flags & TCQ_F_BUILTIN)) + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); + + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + qdisc_reset(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + } +} + +static bool some_qdisc_is_busy(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + spinlock_t *root_lock; + struct Qdisc *q; + int val; + + dev_queue = netdev_get_tx_queue(dev, i); + q = dev_queue->qdisc_sleeping; + root_lock = qdisc_lock(q); + + spin_lock_bh(root_lock); + + val = (test_bit(__QDISC_STATE_RUNNING, &q->state) || + test_bit(__QDISC_STATE_SCHED, &q->state)); + + spin_unlock_bh(root_lock); + + if (val) + return true; + } + return false; +} + +void dev_deactivate(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); + dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc); + + dev_watchdog_down(dev); + +#ifndef DDE_LINUX + /* Wait for outstanding qdisc-less dev_queue_xmit calls. */ + synchronize_rcu(); +#endif + + /* Wait for outstanding qdisc_run calls. */ + while (some_qdisc_is_busy(dev)) + yield(); +} + +static void dev_init_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc) +{ + struct Qdisc *qdisc = _qdisc; + + dev_queue->qdisc = qdisc; + dev_queue->qdisc_sleeping = qdisc; +} + +void dev_init_scheduler(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); + dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); + + setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); +} + +static void shutdown_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + struct Qdisc *qdisc_default = _qdisc_default; + + if (qdisc) { + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + dev_queue->qdisc_sleeping = qdisc_default; + + qdisc_destroy(qdisc); + } +} + +void dev_shutdown(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); + shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); + WARN_ON(timer_pending(&dev->watchdog_timer)); +} |
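
The pfifo_fast scheduler above keeps three FIFO bands and serves them in strict priority order: enqueue picks a band by indexing prio2band with skb->priority & TC_PRIO_MAX, and dequeue always drains the lowest-numbered non-empty band first. The user-space sketch below is illustrative only and not part of this patch; struct pkt and the fixed 64-entry arrays are invented stand-ins for struct sk_buff and struct sk_buff_head, but the band table and the dequeue order match the code above.

/* Sketch of the pfifo_fast band logic: same prio2band table and
 * strict-priority dequeue as above, with plain arrays instead of
 * sk_buff queues. Builds with any C99 compiler. */
#include <stdio.h>

#define TC_PRIO_MAX       15
#define PFIFO_FAST_BANDS   3

static const unsigned char prio2band[TC_PRIO_MAX + 1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

struct pkt { int id; int priority; };

static struct pkt queue[PFIFO_FAST_BANDS][64];	/* one FIFO per band */
static int head[PFIFO_FAST_BANDS], tail[PFIFO_FAST_BANDS];

static void enqueue(struct pkt p)
{
	int band = prio2band[p.priority & TC_PRIO_MAX];

	if (tail[band] < 64)		/* the kernel calls qdisc_drop() here */
		queue[band][tail[band]++] = p;
}

static struct pkt *dequeue(void)
{
	/* band 0 drains completely before band 1, band 1 before band 2 */
	for (int band = 0; band < PFIFO_FAST_BANDS; band++)
		if (head[band] < tail[band])
			return &queue[band][head[band]++];
	return NULL;
}

int main(void)
{
	struct pkt in[] = { {1, 0}, {2, 6}, {3, 2}, {4, 7} };

	for (unsigned i = 0; i < sizeof(in) / sizeof(in[0]); i++)
		enqueue(in[i]);

	/* priorities 6 and 7 map to band 0, so packets 2 and 4 leave first,
	 * then packet 1 (band 1), then packet 3 (band 2) */
	for (struct pkt *p; (p = dequeue()) != NULL; )
		printf("id=%d priority=%d band=%d\n", p->id, p->priority,
		       prio2band[p->priority & TC_PRIO_MAX]);
	return 0;
}

Unlike the sketch's fixed per-band bound, pfifo_fast_enqueue above caps each band's length at the device's tx_queue_len and falls back to qdisc_drop() when the band is full.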
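
qdisc_alloc() above over-allocates by QDISC_ALIGNTO - 1 bytes, rounds the object pointer up to the alignment boundary, and records the offset in sch->padded so that qdisc_destroy() can later release the original allocation with kfree((char *)qdisc - qdisc->padded). A minimal user-space approximation of that trick, assuming the 32-byte QDISC_ALIGNTO of this kernel version, with calloc/free standing in for kzalloc/kfree and struct fake_qdisc as an invented stand-in for struct Qdisc plus its private area:

/* Sketch of the padded-allocation scheme used by qdisc_alloc() and
 * undone by qdisc_destroy(): align the object, remember the padding. */
#include <stdio.h>
#include <stdlib.h>

#define QDISC_ALIGNTO	32UL
#define QDISC_ALIGN(x)	(((x) + QDISC_ALIGNTO - 1) & ~(QDISC_ALIGNTO - 1))

struct fake_qdisc {
	unsigned int padded;	/* offset back to the raw allocation */
	char priv[40];		/* stand-in for ops->priv_size bytes */
};

static struct fake_qdisc *fake_qdisc_alloc(void)
{
	/* over-allocate so the object still fits after rounding up */
	size_t size = QDISC_ALIGN(sizeof(struct fake_qdisc)) + (QDISC_ALIGNTO - 1);
	char *p = calloc(1, size);
	struct fake_qdisc *sch;

	if (!p)
		return NULL;
	sch = (struct fake_qdisc *)QDISC_ALIGN((unsigned long)p);
	sch->padded = (unsigned int)((char *)sch - p);
	return sch;
}

static void fake_qdisc_free(struct fake_qdisc *sch)
{
	free((char *)sch - sch->padded);   /* mirrors the kfree() in qdisc_destroy() */
}

int main(void)
{
	struct fake_qdisc *sch = fake_qdisc_alloc();

	if (!sch)
		return 1;
	printf("32-byte aligned: %s, padded=%u\n",
	       ((unsigned long)sch % QDISC_ALIGNTO) == 0 ? "yes" : "no",
	       sch->padded);
	fake_qdisc_free(sch);
	return 0;
}

The padding bookkeeping is what lets qdisc_destroy() free the qdisc without knowing how far the pointer was advanced at allocation time.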