Diffstat (limited to 'libdde_linux26/lib')
131 files changed, 0 insertions, 60342 deletions
diff --git a/libdde_linux26/lib/.svn/all-wcprops b/libdde_linux26/lib/.svn/all-wcprops deleted file mode 100644 index ea9df250..00000000 --- a/libdde_linux26/lib/.svn/all-wcprops +++ /dev/null @@ -1,17 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 54 -/repos/tudos/!svn/ver/457/trunk/l4/pkg/dde/linux26/lib -END -Makefile -K 25 -svn:wc:ra_dav:version-url -V 63 -/repos/tudos/!svn/ver/322/trunk/l4/pkg/dde/linux26/lib/Makefile -END -README -K 25 -svn:wc:ra_dav:version-url -V 61 -/repos/tudos/!svn/ver/174/trunk/l4/pkg/dde/linux26/lib/README -END diff --git a/libdde_linux26/lib/.svn/entries b/libdde_linux26/lib/.svn/entries deleted file mode 100644 index ded8ca64..00000000 --- a/libdde_linux26/lib/.svn/entries +++ /dev/null @@ -1,102 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib -http://svn.tudos.org/repos/tudos - - - -2009-05-23T02:50:17.774710Z -457 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -src_ip -dir - -src -dir - -Makefile -file - - - - -2009-11-15T17:17:13.000000Z -6e4b787b1a15fafe5e8c6ec03151b863 -2008-03-18T03:51:56.301196Z -322 -l4check - - - - - - - - - - - - - - - - - - - - - -188 - -README -file - - - - -2009-11-15T17:17:13.000000Z -fb1943fc687297e5d57d507e6c75fcb0 -2007-09-08T19:44:13.897747Z -174 -l4check - - - - - - - - - - - - - - - - - - - - - -103 - diff --git a/libdde_linux26/lib/.svn/format b/libdde_linux26/lib/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/.svn/text-base/Makefile.svn-base b/libdde_linux26/lib/.svn/text-base/Makefile.svn-base deleted file mode 100644 index 0ea410d0..00000000 --- a/libdde_linux26/lib/.svn/text-base/Makefile.svn-base +++ /dev/null @@ -1,11 +0,0 @@ -PKGDIR ?= ../.. -L4DIR ?= $(PKGDIR)/../.. - -include $(L4DIR)/mk/Makeconf --include $(PKGDIR_OBJ)/Makeconf - -ifeq ($(CONFIG_DDE26),y) -TARGET = src src_ip -endif - -include $(L4DIR)/mk/subdir.mk diff --git a/libdde_linux26/lib/.svn/text-base/README.svn-base b/libdde_linux26/lib/.svn/text-base/README.svn-base deleted file mode 100644 index ed1e88de..00000000 --- a/libdde_linux26/lib/.svn/text-base/README.svn-base +++ /dev/null @@ -1,5 +0,0 @@ -These are the DDE libraries. 
- -* src - main DDELinux lib -* net - network support for DDELinux -[* sound] diff --git a/libdde_linux26/lib/src/.svn/all-wcprops b/libdde_linux26/lib/src/.svn/all-wcprops deleted file mode 100644 index 64a1cdea..00000000 --- a/libdde_linux26/lib/src/.svn/all-wcprops +++ /dev/null @@ -1,11 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 58 -/repos/tudos/!svn/ver/457/trunk/l4/pkg/dde/linux26/lib/src -END -Makefile -K 25 -svn:wc:ra_dav:version-url -V 67 -/repos/tudos/!svn/ver/457/trunk/l4/pkg/dde/linux26/lib/src/Makefile -END diff --git a/libdde_linux26/lib/src/.svn/entries b/libdde_linux26/lib/src/.svn/entries deleted file mode 100644 index 5308d838..00000000 --- a/libdde_linux26/lib/src/.svn/entries +++ /dev/null @@ -1,89 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src -http://svn.tudos.org/repos/tudos - - - -2009-05-23T02:50:17.774710Z -457 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -kernel -dir - -lib -dir - -net -dir - -security -dir - -fs -dir - -mm -dir - -block -dir - -arch -dir - -Makefile -file - - - - -2009-11-15T17:17:13.000000Z -c0f809fbe042ad3390c63a64b7c16694 -2009-05-23T02:50:17.774710Z -457 -l4check - - - - - - - - - - - - - - - - - - - - - -7984 - -drivers -dir - diff --git a/libdde_linux26/lib/src/.svn/format b/libdde_linux26/lib/src/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/.svn/text-base/Makefile.svn-base b/libdde_linux26/lib/src/.svn/text-base/Makefile.svn-base deleted file mode 100644 index 3707b2cb..00000000 --- a/libdde_linux26/lib/src/.svn/text-base/Makefile.svn-base +++ /dev/null @@ -1,230 +0,0 @@ -PKGDIR ?= ../../.. -L4DIR ?= $(PKGDIR)/../.. 
-CONTRIB ?= $(PKGDIR)/linux26/contrib - --include $(PKGDIR_OBJ)/Makeconf - -ifeq ($(CONFIG_DDE26_COMMON),y) -TARGET += libdde_linux26.o.a -endif - -ifeq ($(CONFIG_DDE26_NET),y) -TARGET += libdde_linux26_net.a -endif - -ifeq ($(CONFIG_DDE26_BLOCK),y) -TARGET += libdde_linux26_block.a -endif - -ifeq ($(CONFIG_DDE26_SOUND),y) -TARGET += libdde_linux26_sound.a -endif - -ifeq ($(CONFIG_DDE26_CHAR),y) -TARGET += libdde_linux26_char.a -endif - -SYSTEMS = x86-l4v2 - -ifeq ($(ARCH), x86) -ARCH_DIR = arch/x86 -endif - -ifeq ($(ARCH), arm) -ARCH_DIR = arch/arm -MARCH = realview -DEFINES += -D__LINUX_ARM_ARCH__=6 -endif - -# contrib sources are in $(CONTRIB) -vpath %.c $(CONTRIB) -vpath %.S $(CONTRIB) - -PRIVATE_INCDIR += $(CONTRIB)/drivers/pci $(PKGDIR)/linux26/lib/src/arch/l4 \ - $(CONTRIB)/$(ARCH_DIR)/pci $(CONTRIB)/drivers/base/ \ - $(CONTRIB)/lib $(PKGDIR_OBJ) $(CONTRIB)/net/core $(CONTRIB)/fs - -################################################################## -# Sources for libdde_linux.a # -################################################################## -SRC_DDE = cli_sti.c fs.c hw-helpers.c init_task.c init.c pci.c power.c \ - process.c res.c sched.c signal.c smp.c softirq.c timer.c \ - page_alloc.c kmem_cache.c kmalloc.c irq.c param.c \ - vmalloc.c vmstat.c mm-helper.c - -# our implementation -SRC_C_libdde_linux26.o.a = $(addprefix arch/l4/, $(SRC_DDE)) - -ifeq ($(ARCH), x86) -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/semaphore_32.S -SRC_C_libdde_linux26.o.a += lib/rwsem.c -SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/pci-dma.c -SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/pci-nommu.c -SRC_S_libdde_linux26_net.a += $(ARCH_DIR)/lib/checksum_32.S -endif - -ifeq ($(ARCH), arm) -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/changebit.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/clearbit.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/div64.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/findbit.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/memzero.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/setbit.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/testclearbit.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/testchangebit.S -SRC_S_libdde_linux26.o.a += $(ARCH_DIR)/lib/testsetbit.S -SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/semaphore.c -SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/kernel/traps.c -SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/mach-$(MARCH)/clock.c -SRC_C_libdde_linux26.o.a += $(ARCH_DIR)/mach-$(MARCH)/realview_eb.c -SRC_C_libdde_linux26.o.a += lib/rwsem-spinlock.c -SRC_C_libdde_linux26.o.a += drivers/amba/bus.c -endif - -# + contrib stuff / slightly modified stuff -SRC_C_libdde_linux26.o.a += \ - kernel/exit.c \ - kernel/kthread.c \ - kernel/mutex.c \ - kernel/notifier.c \ - kernel/resource.c \ - kernel/rwsem.c \ - kernel/sched.c \ - kernel/semaphore.c \ - kernel/sys.c \ - kernel/time.c \ - kernel/timer.c \ - kernel/wait.c \ - kernel/workqueue.c \ - lib/bitmap.c \ - lib/bitrev.c \ - lib/crc32.c \ - lib/ctype.c \ - lib/cpumask.c \ - lib/find_next_bit.c \ - lib/hexdump.c \ - lib/idr.c \ - lib/iomap.c \ - lib/hweight.c \ - lib/kasprintf.c \ - lib/kernel_lock.c \ - lib/klist.c \ - lib/kobject.c \ - lib/kref.c \ - lib/parser.c \ - lib/proportions.c \ - lib/radix-tree.c \ - lib/scatterlist.c \ - lib/sha1.c \ - lib/string.c \ - lib/vsprintf.c \ - mm/dmapool.c \ - mm/mempool.c \ - mm/swap.c \ - mm/util.c \ - drivers/base/attribute_container.c \ - drivers/base/bus.c \ - drivers/base/class.c \ - drivers/base/core.c \ - drivers/base/cpu.c \ - drivers/base/dd.c \ - drivers/base/devres.c \ - 
drivers/base/driver.c \ - drivers/base/init.c \ - drivers/base/map.c \ - drivers/base/platform.c \ - drivers/base/sys.c \ - drivers/pci/access.c \ - drivers/pci/bus.c \ - drivers/pci/hotplug-pci.c \ - drivers/pci/pci.c \ - drivers/pci/pci-driver.c \ - drivers/pci/probe.c \ - drivers/pci/search.c \ - drivers/pci/setup-bus.c \ - drivers/pci/setup-res.c - -################################################################## -# Sources for libdde_linux_net.a # -################################################################## -SRC_C_libdde_linux26_net.a += \ - arch/l4/net.c \ - drivers/net/mii.c \ - net/core/dev.c \ - net/core/dev_mcast.c \ - net/core/ethtool.c \ - net/core/link_watch.c \ - net/core/neighbour.c \ - net/core/netevent.c \ - net/core/net-sysfs.c \ - net/core/net_namespace.c \ - net/core/rtnetlink.c \ - net/core/skbuff.c \ - net/core/skb_dma_map.c \ - net/core/utils.c \ - net/ethernet/eth.c \ - net/sched/sch_generic.c - -################################################################## -# Sources for libdde_linux_sound.a # -################################################################## -SRC_C_libdde_linux26_sound.a += \ - sound/sound_core.c \ - arch/l4/sound.c - -################################################################## -# Sources for libdde_linux_block.a # -################################################################## -# -SRC_C_libdde_linux26_block.a += \ - arch/l4/inodes.c \ - block/blk-barrier.c \ - block/blk-core.c \ - block/blk-exec.c \ - block/blk-ioc.c \ - block/blk-merge.c \ - block/blk-settings.c \ - block/blk-softirq.c \ - block/blk-sysfs.c \ - block/blk-tag.c \ - block/blk-timeout.c \ - block/elevator.c \ - block/genhd.c \ - block/noop-iosched.c \ - block/ioctl.c \ - block/scsi_ioctl.c \ - mm/backing-dev.c \ - mm/bounce.c \ - mm/page-writeback.c \ - fs/bio.c \ - fs/block_dev.c \ - fs/buffer.c \ - fs/filesystems.c -################################################################## -# Sources for libdde_linux_char.a # -################################################################## -SRC_C_libdde_linux26_char.a += \ - arch/l4/inodes.c \ - fs/char_dev.c - -all:: -lib/crc32.o : crc32table.h -lib/crc32.o : PRIVATE_INCDIR += . -kernel/time.o : timeconst.h -kernel/time.o : PRIVATE_INCDIR += . 
- -timeconst.h : $(SRC_DIR)/kernel/timeconst.pl - @$(GEN_MESSAGE) - $(VERBOSE)$< 250 >$@ - -crc32table.h : gen_crc32table - @$(GEN_MESSAGE) - $(VERBOSE)./$< >$@ - -gen_crc32table : lib/gen_crc32table.c - @$(GEN_MESSAGE) - $(VERBOSE)$(HOST_CC) -O2 -o $@ $< - -include $(PKGDIR)/linux26/Makeconf - -include $(L4DIR)/mk/lib.mk diff --git a/libdde_linux26/lib/src/arch/.svn/all-wcprops b/libdde_linux26/lib/src/arch/.svn/all-wcprops deleted file mode 100644 index 775206bc..00000000 --- a/libdde_linux26/lib/src/arch/.svn/all-wcprops +++ /dev/null @@ -1,5 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 63 -/repos/tudos/!svn/ver/457/trunk/l4/pkg/dde/linux26/lib/src/arch -END diff --git a/libdde_linux26/lib/src/arch/.svn/entries b/libdde_linux26/lib/src/arch/.svn/entries deleted file mode 100644 index b075adca..00000000 --- a/libdde_linux26/lib/src/arch/.svn/entries +++ /dev/null @@ -1,34 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/arch -http://svn.tudos.org/repos/tudos - - - -2009-05-23T02:50:17.774710Z -457 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -l4 -dir - -x86 -dir - diff --git a/libdde_linux26/lib/src/arch/.svn/format b/libdde_linux26/lib/src/arch/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/arch/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/arch/l4/.svn/all-wcprops b/libdde_linux26/lib/src/arch/l4/.svn/all-wcprops deleted file mode 100644 index 132337d1..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/all-wcprops +++ /dev/null @@ -1,155 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 66 -/repos/tudos/!svn/ver/457/trunk/l4/pkg/dde/linux26/lib/src/arch/l4 -END -local.h -K 25 -svn:wc:ra_dav:version-url -V 74 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/local.h -END -smp.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/smp.c -END -param.c -K 25 -svn:wc:ra_dav:version-url -V 74 -/repos/tudos/!svn/ver/240/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/param.c -END -init.c -K 25 -svn:wc:ra_dav:version-url -V 73 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/init.c -END -fs.c -K 25 -svn:wc:ra_dav:version-url -V 71 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/fs.c -END -pci.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/pci.c -END -kmem_cache.c -K 25 -svn:wc:ra_dav:version-url -V 79 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/kmem_cache.c -END -signal.c -K 25 -svn:wc:ra_dav:version-url -V 75 -/repos/tudos/!svn/ver/174/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/signal.c -END -hw-helpers.c -K 25 -svn:wc:ra_dav:version-url -V 79 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/hw-helpers.c -END -process.c -K 25 -svn:wc:ra_dav:version-url -V 76 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/process.c -END -vmstat.c -K 25 -svn:wc:ra_dav:version-url -V 75 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/vmstat.c -END -timer.c -K 25 -svn:wc:ra_dav:version-url -V 74 -/repos/tudos/!svn/ver/457/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/timer.c -END -inodes.c -K 25 -svn:wc:ra_dav:version-url -V 75 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/inodes.c -END -kmalloc.c -K 25 -svn:wc:ra_dav:version-url -V 76 
-/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/kmalloc.c -END -init_task.c -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/init_task.c -END -page_alloc.c -K 25 -svn:wc:ra_dav:version-url -V 79 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/page_alloc.c -END -net.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/net.c -END -mm-helper.c -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/mm-helper.c -END -cli_sti.c -K 25 -svn:wc:ra_dav:version-url -V 76 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/cli_sti.c -END -sched.c -K 25 -svn:wc:ra_dav:version-url -V 74 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/sched.c -END -softirq.c -K 25 -svn:wc:ra_dav:version-url -V 76 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/softirq.c -END -res.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/res.c -END -irq.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/irq.c -END -power.c -K 25 -svn:wc:ra_dav:version-url -V 74 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/power.c -END -vmalloc.c -K 25 -svn:wc:ra_dav:version-url -V 76 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/l4/vmalloc.c -END diff --git a/libdde_linux26/lib/src/arch/l4/.svn/entries b/libdde_linux26/lib/src/arch/l4/.svn/entries deleted file mode 100644 index a4e08f01..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/entries +++ /dev/null @@ -1,878 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/arch/l4 -http://svn.tudos.org/repos/tudos - - - -2009-05-23T02:50:17.774710Z -457 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -local.h -file - - - - -2009-11-15T17:17:12.000000Z -29e4e373a5332517fa8d3a54b63a934c -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -3155 - -smp.c -file - - - - -2009-11-15T17:17:12.000000Z -0baa40739f76a596efe1d2ef99768f59 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -1294 - -param.c -file - - - - -2009-11-15T17:17:12.000000Z -8b9465dfae207ca0ce9548228914b19f -2007-11-27T03:55:44.347675Z -240 -l4check - - - - - - - - - - - - - - - - - - - - - -1039 - -init.c -file - - - - -2009-11-15T17:17:12.000000Z -c59f682047dbf216f271ba4fb8f962ef -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -713 - -fs.c -file - - - - -2009-11-15T17:17:12.000000Z -361eeaac7be43ebe64478812bdf3808a -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -2353 - -pci.c -file - - - - -2009-11-15T17:17:12.000000Z -670757aeca81d5fdbcce07c45902b2f4 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -4732 - -kmem_cache.c -file - - - - -2009-11-15T17:17:12.000000Z -1d7b5540f6113539b83f4688eb5a320d -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -5429 - -signal.c -file - - - - -2009-11-15T17:17:12.000000Z -1c1133b1a3dcf504174b892eba60986a -2007-09-08T19:44:13.897747Z -174 -l4check - - - - - - - - - - - - - - - - - - - - - -875 - -hw-helpers.c -file - - - - -2009-11-15T17:17:12.000000Z 
-7900d76d82fab85c74b0d8baec1baac5 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -170 - -process.c -file - - - - -2009-11-15T17:17:12.000000Z -e256217d715b25cf56bb937480c82376 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -9859 - -vmstat.c -file - - - - -2009-11-15T17:17:12.000000Z -3f706a9a494cf0bfd99facee793dd0d5 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -608 - -timer.c -file - - - - -2009-11-15T17:17:12.000000Z -f13640bc9b9d4520e7b8ec09d3b9e452 -2009-05-23T02:50:17.774710Z -457 -l4check - - - - - - - - - - - - - - - - - - - - - -3669 - -inodes.c -file - - - - -2009-11-15T17:17:12.000000Z -8f1f06ea530105b7b1b1b24fd0cb5d00 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -6687 - -kmalloc.c -file - - - - -2009-11-15T17:17:12.000000Z -2fd70eccfddd108357815aa4bb031354 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -4327 - -init_task.c -file - - - - -2009-11-15T17:17:12.000000Z -fc20d990328c12f596b836e722781f63 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -3360 - -page_alloc.c -file - - - - -2009-11-15T17:17:12.000000Z -584a5941cdf1efe2435718993a980ba1 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -6768 - -net.c -file - - - - -2009-11-15T17:17:12.000000Z -2adc371a98f0fd8fae363e0f70854314 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -1307 - -mm-helper.c -file - - - - -2009-11-15T17:17:12.000000Z -4b1d4ac41bb0a6ffb8c9bbe6f1ab95b1 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -880 - -cli_sti.c -file - - - - -2009-11-15T17:17:12.000000Z -8732f061e7ff7d24c42fb8dc9aec718f -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -1284 - -sched.c -file - - - - -2009-11-15T17:17:12.000000Z -904a23f9a1efa20b904b9294e5c3fe43 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -3206 - -softirq.c -file - - - - -2009-11-15T17:17:12.000000Z -7564de83f9ac6f983ff7e8346c78bba8 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -5778 - -res.c -file - - - - -2009-11-15T17:17:12.000000Z -9190bb75985ff0ee2135c7ae47a7800d -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -4275 - -irq.c -file - - - - -2009-11-15T17:17:12.000000Z -19e10dde42bbbe5176338d96ae8553ba -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -5607 - -power.c -file - - - - -2009-11-15T17:17:12.000000Z -c108e3ad0b0c0c68015fed4e159c1b53 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -642 - -vmalloc.c -file - - - - -2009-11-15T17:17:12.000000Z -73cceaf52046b2f0d152ad8cfde1685f -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -919 - diff --git a/libdde_linux26/lib/src/arch/l4/.svn/format b/libdde_linux26/lib/src/arch/l4/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/cli_sti.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/cli_sti.c.svn-base deleted file mode 100644 index 81c4feea..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/cli_sti.c.svn-base +++ /dev/null @@ -1,66 +0,0 @@ -#include "local.h" - 
-#include <linux/kernel.h> - -/* IRQ lock reference counter */ -static atomic_t _refcnt = ATOMIC_INIT(0); - -/* Check whether IRQs are currently disabled. - * - * This is the case, if flags is greater than 0. - */ - -int raw_irqs_disabled_flags(unsigned long flags) -{ - return ((int)flags > 0); -} - -/* Store the current flags state. - * - * This is done by returning the current refcnt. - * - * XXX: Up to now, flags was always 0 at this point and - * I assume that this is always the case. Prove? - */ -unsigned long __raw_local_save_flags(void) -{ - return (unsigned long)atomic_read(&_refcnt); -} - -/* Restore IRQ state. */ -void raw_local_irq_restore(unsigned long flags) -{ - atomic_set(&_refcnt, flags); -} - -/* Disable IRQs by grabbing the IRQ lock. */ -void raw_local_irq_disable(void) -{ - atomic_inc(&_refcnt); -} - -/* Unlock the IRQ lock until refcnt is 0. */ -void raw_local_irq_enable(void) -{ - atomic_set(&_refcnt, 0); -} - - -void raw_safe_halt(void) -{ - WARN_UNIMPL; -} - - -void halt(void) -{ - WARN_UNIMPL; -} - -/* These functions are empty for DDE. Every DDE thread is a separate - * "virtual" CPU. Therefore there is no need to en/disable bottom halves. - */ -void local_bh_disable(void) {} -void __local_bh_enable(void) {} -void _local_bh_enable(void) {} -void local_bh_enable(void) {} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/fs.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/fs.c.svn-base deleted file mode 100644 index db452949..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/fs.c.svn-base +++ /dev/null @@ -1,111 +0,0 @@ -#include "local.h" - -#include <linux/fs.h> -#include <linux/backing-dev.h> -#include <linux/mount.h> - -/* - * Some subsystems, such as the blockdev layer, implement their data - * hierarchy as a pseudo file system. To not incorporate the complete - * Linux VFS implementation, we cut this down to an own one only for - * pseudo file systems. - */ -static LIST_HEAD(dde_vfs_mounts); - -#define MAX_RA_PAGES 1 - -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page* p) -{ -} - -struct backing_dev_info default_backing_dev_info = { - .ra_pages = MAX_RA_PAGES, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, -}; - -int seq_printf(struct seq_file *m, const char *f, ...) 
-{ - WARN_UNIMPL; - return 0; -} - -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - WARN_UNIMPL; - return 0; -} - - -/************************************** - * Filemap stuff * - **************************************/ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) -{ - WARN_UNIMPL; - return NULL; -} - -void unlock_page(struct page *page) -{ - WARN_UNIMPL; -} - -int test_set_page_writeback(struct page *page) -{ - WARN_UNIMPL; - return 0; -} - -void end_page_writeback(struct page *page) -{ - WARN_UNIMPL; -} - -void do_invalidatepage(struct page *page, unsigned long offset) -{ - WARN_UNIMPL; -} - -int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) -{ - WARN_UNIMPL; - return 0; -} - -static struct vfsmount *dde_kern_mount(struct file_system_type *type, - int flags, const char *name, - void *data) -{ - struct list_head *pos, *head; - int error; - - head = &dde_vfs_mounts; - __list_for_each(pos, head) { - struct vfsmount *mnt = list_entry(pos, struct vfsmount, next); - if (strcmp(name, mnt->name) == 0) { - printk("FS type %s already mounted!?\n", name); - BUG(); - return NULL; - } - } - - struct vfsmount *m = kzalloc(sizeof(*m), GFP_KERNEL); - m->fs_type = type; - m->name = kmalloc(strlen(name) + 1, GFP_KERNEL); - memcpy(m->name, name, strlen(name) + 1); - - error = type->get_sb(type, flags, name, data, m); - BUG_ON(error); - - list_add_tail(&m->next, &dde_vfs_mounts); - - return m; -} - -struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) -{ - return dde_kern_mount(type, 0, type->name, NULL); -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/hw-helpers.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/hw-helpers.c.svn-base deleted file mode 100644 index 555406c9..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/hw-helpers.c.svn-base +++ /dev/null @@ -1,12 +0,0 @@ -#include "local.h" - -#include <linux/kexec.h> - -note_buf_t *crash_notes = NULL; - -void touch_nmi_watchdog(void) -{ - WARN_UNIMPL; -} - -unsigned long pci_mem_start = 0xABCDABCD; diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/init.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/init.c.svn-base deleted file mode 100644 index e89ef27f..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/init.c.svn-base +++ /dev/null @@ -1,33 +0,0 @@ -#include "local.h" - -#include <l4/dde/linux26/dde26.h> -#include <l4/dde/dde.h> - -#define DEBUG_PCI(msg, ...) ddekit_printf( "\033[33m"msg"\033[0m\n", ##__VA_ARGS__) - -/* Didn't know where to put this. 
*/ -unsigned long __per_cpu_offset[NR_CPUS]; - -extern void driver_init(void); -extern int classes_init(void); - -void __init __attribute__((used)) l4dde26_init(void) -{ - /* first, initialize DDEKit */ - ddekit_init(); - - l4dde26_kmalloc_init(); - - /* Init Linux driver framework before trying to add PCI devs to the bus */ - driver_init(); - - printk("Initialized DDELinux 2.6\n"); -} - -void l4dde26_do_initcalls(void) -{ - /* finally, let DDEKit perform all the initcalls */ - ddekit_do_initcalls(); -} - -dde_initcall(l4dde26_init); diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/init_task.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/init_task.c.svn-base deleted file mode 100644 index 685373d1..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/init_task.c.svn-base +++ /dev/null @@ -1,131 +0,0 @@ -#include "local.h" - -//#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/uaccess.h> - -#include <linux/fs.h> -#include <linux/fdtable.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/ipc_namespace.h> -#include <linux/kernel.h> -#include <linux/mqueue.h> -#include <linux/module.h> -#include <linux/personality.h> - -/* init task */ -struct task_struct init_task; - -/* From kernel/pid.c */ -#define BITS_PER_PAGE (PAGE_SIZE*8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - -/* From init/main.c */ -enum system_states system_state; -EXPORT_SYMBOL(system_state); - -struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, -}; - -struct files_struct init_files = { - .count = ATOMIC_INIT(1), - .fdt = &init_files.fdtab, - .fdtab = { - .max_fds = NR_OPEN_DEFAULT, - .fd = &init_files.fd_array[0], - .close_on_exec = (fd_set *)&init_files.close_on_exec_init, - .open_fds = (fd_set *)&init_files.open_fds_init, - .rcu = RCU_HEAD_INIT, - }, - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), -}; - -struct signal_struct init_signals = INIT_SIGNALS(init_signals); -struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); -pgd_t swapper_pg_dir[PTRS_PER_PGD]; -union thread_union init_thread_union = { INIT_THREAD_INFO(init_task) }; -struct group_info init_groups = {.usage = ATOMIC_INIT(2)}; - -struct user_struct root_user = { - .__count = ATOMIC_INIT(1), - .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), - .sigpending = ATOMIC_INIT(0), - .mq_bytes = 0, - .locked_shm = 0, -}; - -/* - * PID-map pages start out as NULL, they get allocated upon - * first use and are never deallocated. This way a low pid_max - * value does not cause lots of bitmaps to be allocated, but - * the scheme scales to up to 4 million PIDs, runtime. - */ -struct pid_namespace init_pid_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, - .pidmap = { - [ 0 ... 
PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, - .level = 0, - .child_reaper = &init_task, -}; -EXPORT_SYMBOL_GPL(init_pid_ns); - -struct net init_net __attribute__((weak)); - -struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); - -struct ipc_namespace init_ipc_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, -}; - -struct user_namespace init_user_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, -}; - - -struct uts_namespace init_uts_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, - .name = { - .sysname = "L4/DDE", - .nodename = "", - .release = "2.6", - .version = "25", - .machine = "", - .domainname = "", - }, -}; - -struct exec_domain default_exec_domain = { - .name = "Linux", /* name */ - .handler = NULL, /* no signaling! */ - .pers_low = 0, /* PER_LINUX personality. */ - .pers_high = 0, /* PER_LINUX personality. */ - .signal_map = 0, /* Identity map signals. */ - .signal_invmap = 0, /* - both ways. */ -}; - -/* copy of the initial task struct */ -struct task_struct init_task = INIT_TASK(init_task); -/* copy of the initial thread info (which contains init_task) */ -struct thread_info init_thread = INIT_THREAD_INFO(init_task); - -long do_no_restart_syscall(struct restart_block *param) -{ - return -EINTR; -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/inodes.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/inodes.c.svn-base deleted file mode 100644 index 9ef02ed5..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/inodes.c.svn-base +++ /dev/null @@ -1,311 +0,0 @@ -/** lib/src/arch/l4/inodes.c - * - * Assorted dummies implementing inode and superblock access functions, - * which are used by the block layer stuff, but not needed in DDE_Linux. - */ - -#include "local.h" - -#include <linux/fs.h> -#include <linux/module.h> -#include <linux/mount.h> - -/* - * Linux' global list of all super blocks. - */ -LIST_HEAD(super_blocks); - -/********************************** - * Inode stuff * - **********************************/ - -struct inode* new_inode(struct super_block *sb) -{ - if (sb->s_op->alloc_inode) - return sb->s_op->alloc_inode(sb); - - return kzalloc(sizeof(struct inode), GFP_KERNEL); -} - -void __mark_inode_dirty(struct inode *inode, int flags) -{ - WARN_UNIMPL; -} - -void iput(struct inode *inode) -{ - WARN_UNIMPL; -} - -void generic_delete_inode(struct inode *inode) -{ - WARN_UNIMPL; -} - -int invalidate_inodes(struct super_block * sb) -{ - WARN_UNIMPL; - return 0; -} - -void truncate_inode_pages(struct address_space *mapping, loff_t lstart) -{ - WARN_UNIMPL; -} - -void touch_atime(struct vfsmount *mnt, struct dentry *dentry) -{ - WARN_UNIMPL; -} - -/********************************** - * Superblock stuff * - **********************************/ - -struct super_block * get_super(struct block_device *bdev) -{ - WARN_UNIMPL; - return NULL; -} - -int simple_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - WARN_UNIMPL; - return 0; -} - -void kill_anon_super(struct super_block *sb) -{ - WARN_UNIMPL; -} - -void shrink_dcache_sb(struct super_block * sb) -{ - WARN_UNIMPL; -} - -void drop_super(struct super_block *sb) -{ - WARN_UNIMPL; -} - -struct inode_operations empty_iops = { }; -struct file_operations empty_fops = { }; - -/**! Alloc and init a new inode. 
- * - * Basically stolen from linux/fs/inode.c:alloc_inode() - */ -static struct inode *dde_alloc_inode(struct super_block *sb) -{ - struct inode *inode; - - if (sb->s_op->alloc_inode) - inode = sb->s_op->alloc_inode(sb); - else - inode = kzalloc(sizeof(*inode), GFP_KERNEL); - - if (inode) { - inode->i_sb = sb; - inode->i_blkbits = sb->s_blocksize_bits; - inode->i_flags = 0; - atomic_set(&inode->i_count, 1); - inode->i_op = &empty_iops; - inode->i_fop = &empty_fops; - inode->i_nlink = 1; - atomic_set(&inode->i_writecount, 0); - inode->i_size = 0; - inode->i_blocks = 0; - inode->i_bytes = 0; - inode->i_generation = 0; - inode->i_pipe = NULL; - inode->i_bdev = NULL; - inode->i_cdev = NULL; - inode->i_rdev = 0; - inode->dirtied_when = 0; - inode->i_private = NULL; - } - - return inode; -} - - -void __iget(struct inode *inode) -{ - atomic_inc(&inode->i_count); -} - - -static struct inode *dde_new_inode(struct super_block *sb, struct list_head *head, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *data) -{ - struct inode *ret = dde_alloc_inode(sb); - int err = 0; - - if (set) - err = set(ret, data); - - BUG_ON(err); - - __iget(ret); - ret->i_state = I_LOCK|I_NEW; - - list_add_tail(&ret->i_sb_list, &sb->s_inodes); - - return ret; -} - - -struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *data) -{ - struct inode *inode = NULL; - struct list_head *p; - - list_for_each(p, &sb->s_inodes) { - struct inode *i = list_entry(p, struct inode, i_sb_list); - if (test) { - if (!test(i, data)) { - DEBUG_MSG("test false"); - continue; - } - else { - inode = i; - break; - } - } - } - - if (inode) - return inode; - - return dde_new_inode(sb, &sb->s_inodes, test, set, data); -} - -void unlock_new_inode(struct inode *inode) -{ - inode->i_state &= ~(I_LOCK | I_NEW); - wake_up_bit(&inode->i_state, __I_LOCK); -} - -struct super_block *sget(struct file_system_type *type, - int (*test)(struct super_block *, void*), - int (*set)(struct super_block *, void*), - void *data) -{ - struct super_block *s = NULL; - struct list_head *p; - int err; - - if (test) { - list_for_each(p, &type->fs_supers) { - struct super_block *block = list_entry(p, - struct super_block, - s_instances); - if (!test(block, data)) - continue; - return block; - } - } - - s = kzalloc(sizeof(*s), GFP_KERNEL); - BUG_ON(!s); - - INIT_LIST_HEAD(&s->s_dirty); - INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_files); - INIT_LIST_HEAD(&s->s_instances); - INIT_HLIST_HEAD(&s->s_anon); - INIT_LIST_HEAD(&s->s_inodes); - init_rwsem(&s->s_umount); - mutex_init(&s->s_lock); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * The locking rules for s_lock are up to the - * filesystem. 
For example ext3fs has different - * lock ordering than usbfs: - */ - lockdep_set_class(&s->s_lock, &type->s_lock_key); - down_write(&s->s_umount); - s->s_count = S_BIAS; - atomic_set(&s->s_active, 1); - mutex_init(&s->s_vfs_rename_mutex); - mutex_init(&s->s_dquot.dqio_mutex); - mutex_init(&s->s_dquot.dqonoff_mutex); - init_rwsem(&s->s_dquot.dqptr_sem); - init_waitqueue_head(&s->s_wait_unfrozen); - s->s_maxbytes = MAX_NON_LFS; -#if 0 - s->dq_op = sb_dquot_ops; - s->s_qcop = sb_quotactl_ops; - s->s_op = &default_op; -#endif - s->s_time_gran = 1000000000; - - err = set(s, data); - BUG_ON(err); - - s->s_type = type; - strlcpy(s->s_id, type->name, sizeof(s->s_id)); - list_add_tail(&s->s_list, &super_blocks); - list_add(&s->s_instances, &type->fs_supers); - __module_get(type->owner); - return s; -} - -int set_anon_super(struct super_block *s, void *data) -{ - WARN_UNIMPL; - return 0; -} - -int get_sb_pseudo(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, unsigned long magic, - struct vfsmount *mnt) -{ - struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); - struct super_operations default_ops = {}; - struct inode *root = NULL; - struct dentry *dentry = NULL; - struct qstr d_name = {.name = name, .len = strlen(name)}; - - BUG_ON(IS_ERR(s)); - - s->s_flags = MS_NOUSER; - s->s_maxbytes = ~0ULL; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = magic; - s->s_op = ops ? ops : &default_ops; - s->s_time_gran = 1; - root = new_inode(s); - - BUG_ON(!root); - - root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_uid = root->i_gid = 0; -#if 0 - root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; - dentry = d_alloc(NULL, &d_name); - dentry->d_sb = s; - dentry->d_parent = dentry; - d_instantiate(dentry, root); -#endif - s->s_root = dentry; - s->s_flags |= MS_ACTIVE; - - mnt->mnt_sb = s; - mnt->mnt_root = dget(s->s_root); - - DEBUG_MSG("root mnt sb @ %p", mnt->mnt_sb); - - return 0; -} - -void inode_init_once(struct inode *inode) -{ - WARN_UNIMPL; -} - diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/irq.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/irq.c.svn-base deleted file mode 100644 index 0e565e54..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/irq.c.svn-base +++ /dev/null @@ -1,247 +0,0 @@ -/* - * \brief Hardware-interrupt support - * \author Christian Helmuth <ch12@os.inf.tu-dresden.de> - * \date 2007-02-12 - * - * - * - * XXX Consider support for IRQ_HANDLED and friends (linux/irqreturn.h) - */ - -/* Linux */ -#include <linux/interrupt.h> -#include <linux/string.h> /* memset() */ - -/* DDEKit */ -#include <l4/dde/ddekit/interrupt.h> -#include <l4/dde/ddekit/memory.h> - -/* local */ -#include "dde26.h" -#include "local.h" - -/* dummy */ -irq_cpustat_t irq_stat[CONFIG_NR_CPUS]; - -/** - * IRQ handling data - */ -static struct dde_irq -{ - unsigned irq; /* IRQ number */ - unsigned count; /* usage count */ - int shared; /* shared IRQ */ - struct ddekit_thread *thread; /* DDEKit interrupt thread */ - struct irqaction *action; /* Linux IRQ action */ - - struct dde_irq *next; /* next DDE IRQ */ -} *used_irqs; - - -static void irq_thread_init(void *p) { - l4dde26_process_add_worker(); } - - -extern ddekit_sem_t *dde_softirq_sem; -static void irq_handler(void *arg) -{ - struct dde_irq *irq = arg; - struct irqaction *action; - -#if 0 - DEBUG_MSG("irq 0x%x", irq->irq); -#endif - /* interrupt occurred - call all handlers */ - for (action = irq->action; action; action = action->next) { - 
irqreturn_t r = action->handler(action->irq, action->dev_id); -#if 0 - DEBUG_MSG("return: %s", r == IRQ_HANDLED ? "IRQ_HANDLED" : r == IRQ_NONE ? "IRQ_NONE" : "??"); -#endif - } - - /* upon return we check for pending soft irqs */ - if (local_softirq_pending()) - ddekit_sem_up(dde_softirq_sem); -} - - -/***************************** - ** IRQ handler bookkeeping ** - *****************************/ - -/** - * Claim IRQ - * - * \return usage counter or negative error code - * - * FIXME list locking - * FIXME are there more races? - */ -static int claim_irq(struct irqaction *action) -{ - int shared = action->flags & IRQF_SHARED ? 1 : 0; - struct dde_irq *irq; - - /* check if IRQ already used */ - for (irq = used_irqs; irq; irq = irq->next) - if (irq->irq == action->irq) break; - - /* we have to setup IRQ handling */ - if (!irq) { - /* allocate and initalize new descriptor */ - irq = ddekit_simple_malloc(sizeof(*irq)); - if (!irq) return -ENOMEM; - memset(irq, 0, sizeof(*irq)); - - irq->irq = action->irq; - irq->shared = shared; - irq->next = used_irqs; - used_irqs = irq; - - /* attach to interrupt */ - irq->thread = ddekit_interrupt_attach(irq->irq, - irq->shared, - irq_thread_init, - irq_handler, - (void *)irq); - if (!irq->thread) { - ddekit_simple_free(irq); - return -EBUSY; - } - } - - /* does desciptor allow our new handler? */ - if ((!irq->shared || !shared) && irq->action) return -EBUSY; - - /* add handler */ - irq->count++; - action->next = irq->action; - irq->action = action; - - return irq->count; -} - - -/** - * Free previously claimed IRQ - * - * \return usage counter or negative error code - */ -static struct irqaction *release_irq(unsigned irq_num, void *dev_id) -{ - struct dde_irq *prev_irq, *irq; - - /* check if IRQ already used */ - for (prev_irq = 0, irq = used_irqs; irq; - prev_irq = irq, irq = irq->next) - if (irq->irq == irq_num) break; - - if (!irq) return 0; - - struct irqaction *prev_action, *action; - - for (prev_action = 0, action = irq->action; action; - prev_action = action, action = action->next) - if (action->dev_id == dev_id) break; - - if (!action) return 0; - - /* dequeue action from irq */ - if (prev_action) - prev_action->next = action->next; - else - irq->action = action->next; - - /* dequeue irq from used_irqs list and free structure, - if no more actions available */ - if (!irq->action) { - if (prev_irq) - prev_irq->next = irq->next; - else - used_irqs = irq->next; - - /* detach from interrupt */ - ddekit_interrupt_detach(irq->irq); - - ddekit_simple_free(irq); - } - - return action; -} - - -/*************** - ** Linux API ** - ***************/ - -/** - * Request interrupt - * - * \param irq interrupt number - * \param handler interrupt handler -> top half - * \param flags interrupt handling flags (SA_SHIRQ, ...) - * \param dev_name device name - * \param dev_id cookie passed back to handler - * - * \return 0 on success; error code otherwise - * - * \todo FIXME consider locking! 
- */ -int request_irq(unsigned int irq, irq_handler_t handler, - unsigned long flags, const char *dev_name, void *dev_id) -{ - if (!handler) return -EINVAL; - - /* facilitate Linux irqaction for this handler */ - struct irqaction *irq_action = ddekit_simple_malloc(sizeof(*irq_action)); - if (!irq_action) return -ENOMEM; - memset(irq_action, 0, sizeof(*irq_action)); - - irq_action->handler = handler; - irq_action->flags = flags; - irq_action->name = dev_name; - irq_action->dev_id = dev_id; - irq_action->irq = irq; - - /* attach to IRQ */ - int err = claim_irq(irq_action); - if (err < 0) return err; - - return 0; -} - -/** Release Interrupt - * \ingroup mod_irq - * - * \param irq interrupt number - * \param dev_id cookie passed back to handler - * - */ -void free_irq(unsigned int irq, void *dev_id) -{ - struct irqaction *irq_action = release_irq(irq, dev_id); - - if (irq_action) - ddekit_simple_free(irq_action); -} - -void disable_irq(unsigned int irq) -{ - ddekit_interrupt_disable(irq); -} - -void disable_irq_nosync(unsigned int irq) -{ - /* - * Note: - * In contrast to the _nosync semantics, DDEKit's - * disable definitely waits until a currently executed - * IRQ handler terminates. - */ - ddekit_interrupt_disable(irq); -} - -void enable_irq(unsigned int irq) -{ - ddekit_interrupt_enable(irq); -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/kmalloc.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/kmalloc.c.svn-base deleted file mode 100644 index 065c13c7..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/kmalloc.c.svn-base +++ /dev/null @@ -1,199 +0,0 @@ -/* - * \brief kmalloc() implementation - * \author Christian Helmuth <ch12@os.inf.tu-dresden.de> - * \date 2007-01-24 - * - * In Linux 2.6 this resides in mm/slab.c. - * - * This implementation of kmalloc() stays with Linux's and uses kmem_caches for - * some power of two bytes. For larger allocations ddedkit_large_malloc() is - * used. This way, we optimize for speed and potentially waste memory - * resources. - */ - -/* Linux */ -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/bootmem.h> -#include <linux/module.h> -#include <linux/pci.h> -#include <linux/mm.h> -#include <asm/io.h> - -/* DDEKit */ -#include <l4/dde/ddekit/debug.h> -#include <l4/dde/ddekit/memory.h> - -#include <l4/dde/linux26/dde26.h> - -/* dummy */ -int forbid_dac; - -/* This stuff is needed by some drivers, e.g. for ethtool. - * XXX: This is a fake, implement it if you really need ethtool stuff. - */ -struct page* mem_map = NULL; -static bootmem_data_t contig_bootmem_data; -struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; - -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) -{ - return 0; -} -EXPORT_SYMBOL(remap_pfn_range); - -/******************* - ** Configuration ** - *******************/ - -#define DEBUG_MALLOC 0 - -/******************** - ** Implementation ** - ********************/ - -/* - * These are the default caches for kmalloc. Custom caches can have other sizes. 
- */ -static struct cache_sizes malloc_sizes[] = { -#define CACHE(x) { .cs_size = (x) }, -#include <linux/kmalloc_sizes.h> - CACHE(ULONG_MAX) -#undef CACHE -}; - - -/* - * kmalloc() cache names - */ -static const char *malloc_names[] = { -#define CACHE(x) "size-" #x, -#include <linux/kmalloc_sizes.h> - NULL -#undef CACHE -}; - - -/** - * Find kmalloc() cache for size - */ -static struct kmem_cache *find_cache(size_t size) -{ - struct cache_sizes *sizes; - - for (sizes = malloc_sizes; size > sizes->cs_size; ++sizes) ; - - return sizes->cs_cachep; -} - - -/** - * Free previously allocated memory - * @objp: pointer returned by kmalloc. - * - * If @objp is NULL, no operation is performed. - * - * Don't free memory not originally allocated by kmalloc() - * or you will run into trouble. - */ -void kfree(const void *objp) -{ - if (!objp) return; - - /* find cache back-pointer */ - void **p = (void **)objp - 1; - - ddekit_log(DEBUG_MALLOC, "objp=%p cache=%p (%d)", - p, *p, *p ? kmem_cache_size(*p) : 0); - - if (*p) - /* free from cache */ - kmem_cache_free(*p, p); - else - /* no cache for this size - use ddekit free */ - ddekit_large_free(p); -} - - -/** - * Allocate memory - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * kmalloc is the normal method of allocating memory - * in the kernel. - */ -void *__kmalloc(size_t size, gfp_t flags) -{ - /* add space for back-pointer */ - size += sizeof(void *); - - /* find appropriate cache */ - struct kmem_cache *cache = find_cache(size); - - void **p; - if (cache) - /* allocate from cache */ - p = kmem_cache_alloc(cache, flags); - else - /* no cache for this size - use ddekit malloc */ - p = ddekit_large_malloc(size); - - ddekit_log(DEBUG_MALLOC, "size=%d, cache=%p (%d) => %p", - size, cache, cache ? kmem_cache_size(cache) : 0, p); - - /* return pointer to actual chunk */ - if (p) { - *p = cache; - p++; - } - return p; -} - - -size_t ksize(const void *p) -{ - struct kmem_cache *cache = (struct kmem_cache *)*((void**)p - 1); - if (cache) - return kmem_cache_size(cache); - return -1; -} - - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ - void *ret = (void *)__get_free_pages(flag, get_order(size)); - - if (ret != NULL) { - memset(ret, 0, size); - *dma_handle = virt_to_bus(ret); - } - return ret; -} - - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} - - -/******************** - ** Initialization ** - ********************/ - -/** - * dde_linux kmalloc initialization - */ -void l4dde26_kmalloc_init(void) -{ - struct cache_sizes *sizes = malloc_sizes; - const char **names = malloc_names; - - /* init malloc sizes array */ - for (; sizes->cs_size != ULONG_MAX; ++sizes, ++names) - sizes->cs_cachep = kmem_cache_create(*names, sizes->cs_size, 0, 0, 0); -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/kmem_cache.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/kmem_cache.c.svn-base deleted file mode 100644 index 1465ac6c..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/kmem_cache.c.svn-base +++ /dev/null @@ -1,213 +0,0 @@ -/* - * \brief Kmem_cache implementation - * \author Christian Helmuth - * \date 2007-01-22 - * - * In Linux 2.6 this resides in mm/slab.c. - * - * I'll disregard the following function currently... 
- * - * extern struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags); - * extern void *kmem_cache_zalloc(struct kmem_cache *, gfp_t); - */ - -/* Linux */ -#include <linux/slab.h> - -/* DDEKit */ -#include <l4/dde/ddekit/memory.h> -#include <l4/dde/ddekit/lock.h> - - -/******************* - ** Configuration ** - *******************/ - -#define DEBUG_SLAB 0 - -#if DEBUG_SLAB -# define DEBUG_SLAB_ALLOC 1 -#else -# define DEBUG_SLAB_ALLOC 0 -#endif - -/* - * Kmem cache structure - */ -struct kmem_cache -{ - const char *name; /**< cache name */ - unsigned size; /**< obj size */ - - struct ddekit_slab *ddekit_slab_cache; /**< backing DDEKit cache */ - ddekit_lock_t cache_lock; /**< lock */ - void (*ctor)(void *); /**< object constructor */ -}; - - -/** - * Return size of objects in cache - */ -unsigned int kmem_cache_size(struct kmem_cache *cache) -{ - return cache->size; -} - - -/** - * Return name of cache - */ -const char *kmem_cache_name(struct kmem_cache *cache) -{ - return cache->name; -} - - -/** - * kmem_cache_shrink - Shrink a cache. - * @cachep: The cache to shrink. - * - * Releases as many slabs as possible for a cache. - * To help debugging, a zero exit status indicates all slabs were released. - */ -int kmem_cache_shrink(struct kmem_cache *cache) -{ - /* noop */ - return 1; -} - - -/** - * kmem_cache_free - Deallocate an object - * @cachep: The cache the allocation was from. - * @objp: The previously allocated object. - * - * Free an object which was previously allocated from this - * cache. - */ -void kmem_cache_free(struct kmem_cache *cache, void *objp) -{ - ddekit_log(DEBUG_SLAB_ALLOC, "\"%s\" (%p)", cache->name, objp); - - ddekit_lock_lock(&cache->cache_lock); - ddekit_slab_free(cache->ddekit_slab_cache, objp); - ddekit_lock_unlock(&cache->cache_lock); -} - - -/** - * kmem_cache_alloc - Allocate an object - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * - * Allocate an object from this cache. The flags are only relevant - * if the cache has no available objects. - */ -void *kmem_cache_alloc(struct kmem_cache *cache, gfp_t flags) -{ - void *ret; - - ddekit_log(DEBUG_SLAB_ALLOC, "\"%s\" flags=%x", cache->name, flags); - - ddekit_lock_lock(&cache->cache_lock); - ret = ddekit_slab_alloc(cache->ddekit_slab_cache); - ddekit_lock_unlock(&cache->cache_lock); - - // XXX: is it valid to run ctor AND memset to zero? - if (flags & __GFP_ZERO) - memset(ret, 0, cache->size); - else if (cache->ctor) - cache->ctor(ret); - - return ret; -} - - -/** - * kmem_cache_destroy - delete a cache - * @cachep: the cache to destroy - * - * Remove a struct kmem_cache object from the slab cache. - * Returns 0 on success. - * - * It is expected this function will be called by a module when it is - * unloaded. This will remove the cache completely, and avoid a duplicate - * cache being allocated each time a module is loaded and unloaded, if the - * module doesn't have persistent in-kernel storage across loads and unloads. - * - * The cache must be empty before calling this function. - * - * The caller must guarantee that noone will allocate memory from the cache - * during the kmem_cache_destroy(). - */ -void kmem_cache_destroy(struct kmem_cache *cache) -{ - ddekit_log(DEBUG_SLAB, "\"%s\"", cache->name); - - ddekit_slab_destroy(cache->ddekit_slab_cache); - ddekit_simple_free(cache); -} - - -/** - * kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. 
- * @align: The required alignment for the objects. - * @flags: SLAB flags - * @ctor: A constructor for the objects. - * - * Returns a ptr to the cache on success, NULL on failure. - * Cannot be called within a int, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache - * and the @dtor is run before the pages are handed back. - * - * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting unloaded. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - */ -struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, - void (*ctor)(void *)) -{ - ddekit_log(DEBUG_SLAB, "\"%s\" obj_size=%d", name, size); - - struct kmem_cache *cache; - - if (!name) { - printk("kmem_cache name reqeuired\n"); - return 0; - } - - cache = ddekit_simple_malloc(sizeof(*cache)); - if (!cache) { - printk("No memory for slab cache\n"); - return 0; - } - - /* Initialize a physically contiguous cache for kmem */ - if (!(cache->ddekit_slab_cache = ddekit_slab_init(size, 1))) { - printk("DDEKit slab init failed\n"); - ddekit_simple_free(cache); - return 0; - } - - cache->name = name; - cache->size = size; - cache->ctor = ctor; - - ddekit_lock_init_unlocked(&cache->cache_lock); - - return cache; -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/local.h.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/local.h.svn-base deleted file mode 100644 index 35b3e449..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/local.h.svn-base +++ /dev/null @@ -1,99 +0,0 @@ -#ifndef __DDE26_LOCAL_H -#define __DDE26_LOCAL_H - -#include <linux/sched.h> - -#include <l4/dde/ddekit/assert.h> -#include <l4/dde/ddekit/condvar.h> -#include <l4/dde/ddekit/debug.h> -#include <l4/dde/ddekit/initcall.h> -#include <l4/dde/ddekit/interrupt.h> -#include <l4/dde/ddekit/lock.h> -#include <l4/dde/ddekit/memory.h> -#include <l4/dde/ddekit/panic.h> -#include <l4/dde/ddekit/pci.h> -#include <l4/dde/ddekit/pgtab.h> -#include <l4/dde/ddekit/printf.h> -#include <l4/dde/ddekit/resources.h> -#include <l4/dde/ddekit/semaphore.h> -#include <l4/dde/ddekit/thread.h> -#include <l4/dde/ddekit/types.h> -#include <l4/dde/ddekit/timer.h> - -#include <l4/dde/linux26/dde26.h> - -#define DDE_DEBUG 1 -#define DDE_FERRET 0 - -/* Ferret Debugging stuff, note that this is the only point we are using - * L4 headers directly and only for debugging. */ -#if DDE_FERRET -#include <l4/ferret/maj_min.h> -#include <l4/ferret/client.h> -#include <l4/ferret/clock.h> -#include <l4/ferret/types.h> -#include <l4/ferret/sensors/list_producer.h> -#include <l4/ferret/sensors/list_producer_wrap.h> -extern ferret_list_local_t *ferret_ore_sensor; -#endif - -/*** - * Internal representation of a Linux kernel thread. This struct - * contains Linux' data as well as some additional data used by DDE. - */ -typedef struct dde26_thread_data -{ - /* NOTE: _threadinfo needs to be first in this struct! 
*/ - struct thread_info _thread_info; ///< Linux thread info (see current()) - ddekit_thread_t *_ddekit_thread; ///< underlying DDEKit thread - ddekit_sem_t *_sleep_lock; ///< lock used for sleep_interruptible() - struct pid _vpid; ///< virtual PID -} dde26_thread_data; - -#define LX_THREAD(thread_data) ((thread_data)->_thread_info) -#define LX_TASK(thread_data) ((thread_data)->_thread_info.task) -#define DDEKIT_THREAD(thread_data) ((thread_data)->_ddekit_thread) -#define SLEEP_LOCK(thread_data) ((thread_data)->_sleep_lock) -#define VPID_P(thread_data) (&(thread_data)->_vpid) - -#if DDE_DEBUG -#define WARN_UNIMPL printk("unimplemented: %s\n", __FUNCTION__) -#define DEBUG_MSG(msg, ...) printk("%s: \033[36m"msg"\033[0m\n", __FUNCTION__, ##__VA_ARGS__) - -#define DECLARE_INITVAR(name) \ - static struct { \ - int _initialized; \ - char *name; \ - } init_##name = {0, #name,} - -#define INITIALIZE_INITVAR(name) init_##name._initialized = 1 - -#define CHECK_INITVAR(name) \ - if (init_##name._initialized == 0) { \ - printk("DDE26: \033[31;1mUsing uninitialized subsystem: "#name"\033[0m\n"); \ - BUG(); \ - } - -#else /* !DDE_DEBUG */ - -#define WARN_UNIMPL do {} while(0) -#define DEBUG_MSG(...) do {} while(0) -#define DECLARE_INITVAR(name) -#define CHECK_INITVAR(name) do {} while(0) -#define INITIALIZE_INITVAR(name) do {} while(0) - -#endif - -/* since _thread_info always comes first in the thread_data struct, - * we can derive the dde26_thread_data from a task struct by simply - * dereferencing its thread_info pointer - */ -static dde26_thread_data *lxtask_to_ddethread(struct task_struct *t) -{ - return (dde26_thread_data *)(task_thread_info(t)); -} - -extern struct thread_info init_thread; -extern struct task_struct init_task; - -#endif diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/mm-helper.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/mm-helper.c.svn-base deleted file mode 100644 index 68c0213b..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/mm-helper.c.svn-base +++ /dev/null @@ -1,45 +0,0 @@ -/* Linux */ -#include <linux/gfp.h> -#include <linux/string.h> -#include <asm/page.h> - -/* DDEKit */ -#include <l4/dde/ddekit/memory.h> -#include <l4/dde/ddekit/assert.h> -#include <l4/dde/ddekit/panic.h> - -#include "local.h" - -int ioprio_best(unsigned short aprio, unsigned short bprio) -{ - WARN_UNIMPL; - return 0; -} - -void *__alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) -{ - WARN_UNIMPL; - return 0; -} - -/* - * Stolen from linux-2.6.29/fs/libfs.c - */ -ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, - const void *from, size_t available) -{ - loff_t pos = *ppos; - if (pos < 0) - return -EINVAL; - if (pos > available) - return 0; - if (count > available - pos) - count = available - pos; - memcpy(to, from + pos, count); - *ppos = pos + count; - - return count; -} - -int capable(int f) { return 1; } diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/net.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/net.c.svn-base deleted file mode 100644 index d6637d96..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/net.c.svn-base +++ /dev/null @@ -1,36 +0,0 @@ -/****************************************************************************** - * DDELinux networking utilities. * - * * - * Bjoern Doebel <doebel@tudos.org> * - * * - * (c) 2005 - 2007 Technische Universitaet Dresden * - * This file is part of DROPS, which is distributed under the terms of the * - * GNU General Public License 2. 
Please see the COPYING file for details. * - ******************************************************************************/ - -#include <l4/dde/linux26/dde26_net.h> - -#include <linux/kernel.h> -#include <linux/skbuff.h> - -#include "local.h" - - -/* Callback function to be called if a network packet arrives and needs to - * be handled by netif_rx() or netif_receive_skb() - */ -linux_rx_callback l4dde26_rx_callback = NULL; - - -/* Register a netif_rx callback function. - * - * \return pointer to old callback function - */ -linux_rx_callback l4dde26_register_rx_callback(linux_rx_callback cb) -{ - linux_rx_callback old = l4dde26_rx_callback; - l4dde26_rx_callback = cb; - DEBUG_MSG("New rx callback @ %p.", cb); - - return old; -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/page_alloc.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/page_alloc.c.svn-base deleted file mode 100644 index 0a2e3fdf..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/page_alloc.c.svn-base +++ /dev/null @@ -1,281 +0,0 @@ -/* - * \brief Page allocation - * \author Christian Helmuth <ch12@tudos.org> - * Bjoern Doebel <doebel@tudos.org> - * \date 2007-01-22 - * - * In Linux 2.6 this resides in mm/page_alloc.c. - * - * This implementation is far from complete as it does not cover "struct page" - * emulation. In Linux, there's an array of structures for all pages. In - * particular, iteration works for this array like: - * - * struct page *p = alloc_pages(3); // p refers to first page of allocation - * ++p; // p refers to second page - * - * There may be more things to cover and we should have a deep look into the - * kernel parts we want to reuse. Candidates for problems may be file systems, - * storage (USB, IDE), and video (bttv). - */ - -/* Linux */ -#include <linux/gfp.h> -#include <linux/string.h> -#include <linux/pagevec.h> -#include <linux/mm.h> -#include <asm/page.h> - -/* DDEKit */ -#include <l4/dde/ddekit/memory.h> -#include <l4/dde/ddekit/assert.h> -#include <l4/dde/ddekit/panic.h> - -#include "local.h" - -unsigned long max_low_pfn; -unsigned long min_low_pfn; -unsigned long max_pfn; - -/******************* - ** Configuration ** - *******************/ - -#define DEBUG_PAGE_ALLOC 0 - - -/* - * DDE page cache - * - * We need to store all pages somewhere (which in the Linux kernel is - * performed by the huge VM infrastructure. Purpose for us is: - * - make virt_to_phys() work - * - enable external clients to hand in memory (e.g., a dm_phys - * dataspace and make it accessible as Linux pages to the DDE) - */ - -#define DDE_PAGE_CACHE_SHIFT 10 -#define DDE_PAGE_CACHE_SIZE (1 << DDE_PAGE_CACHE_SHIFT) -#define DDE_PAGE_CACHE_MASK (DDE_PAGE_CACHE_SIZE - 1) - -typedef struct -{ - struct hlist_node list; - struct page *page; -} page_cache_entry; - -static struct hlist_head dde_page_cache[DDE_PAGE_CACHE_SIZE]; - -/** Hash function to map virtual addresses to page cache buckets. 
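
The receive hook in net.c above is consumed by the surrounding environment roughly as follows (a sketch: the exact linux_rx_callback signature comes from dde26_net.h and is assumed here to be netif_rx()-like, i.e. int (*)(struct sk_buff *); my_rx_handler is hypothetical):

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <l4/dde/linux26/dde26_net.h>

static int my_rx_handler(struct sk_buff *skb)
{
	/* pass the packet on to the host network stack, then drop our reference */
	/* ... */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static void my_net_init(void)
{
	/* returns the previously installed callback (NULL if none) */
	linux_rx_callback old = l4dde26_register_rx_callback(my_rx_handler);
	(void)old;
}
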
*/ -#define VIRT_TO_PAGEHASH(a) ((((unsigned long)a) >> PAGE_SHIFT) & DDE_PAGE_CACHE_MASK) - - -void dde_page_cache_add(struct page *p) -{ - unsigned int hashval = VIRT_TO_PAGEHASH(p->virtual); - - page_cache_entry *e = kmalloc(sizeof(page_cache_entry), GFP_KERNEL); - -#if DEBUG_PAGE_ALLOC - DEBUG_MSG("virt %p, hash: %x", p->virtual, hashval); -#endif - - e->page = p; - INIT_HLIST_NODE(&e->list); - - hlist_add_head(&e->list, &dde_page_cache[hashval]); -} - - -void dde_page_cache_remove(struct page *p) -{ - unsigned int hashval = VIRT_TO_PAGEHASH(p->virtual); - struct hlist_node *hn = NULL; - struct hlist_head *h = &dde_page_cache[hashval]; - page_cache_entry *e = NULL; - struct hlist_node *v = NULL; - - hlist_for_each_entry(e, hn, h, list) { - if ((unsigned long)e->page->virtual == ((unsigned long)p->virtual & PAGE_MASK)) - v = hn; - break; - } - - if (v) { -#if DEBUG_PAGE_ALLOC - DEBUG_MSG("deleting node %p which contained page %p", v, p); -#endif - hlist_del(v); - } -} - - -struct page* dde_page_lookup(unsigned long va) -{ - unsigned int hashval = VIRT_TO_PAGEHASH(va); - - struct hlist_node *hn = NULL; - struct hlist_head *h = &dde_page_cache[hashval]; - page_cache_entry *e = NULL; - - hlist_for_each_entry(e, hn, h, list) { - if ((unsigned long)e->page->virtual == (va & PAGE_MASK)) - return e->page; - } - - return NULL; -} - - -struct page * __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nm) -{ - /* XXX: In fact, according to order, we should have one struct page - * for every page, not only for the first one. - */ - struct page *ret = kmalloc(sizeof(*ret), GFP_KERNEL); - - ret->virtual = (void *)__get_free_pages(gfp_mask, order); - dde_page_cache_add(ret); - - return ret; -} - - -unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) -{ - ddekit_log(DEBUG_PAGE_ALLOC, "gfp_mask=%x order=%d (%d bytes)", - gfp_mask, order, PAGE_SIZE << order); - - Assert(gfp_mask != GFP_DMA); - void *p = ddekit_large_malloc(PAGE_SIZE << order); - - return (unsigned long)p; -} - - -unsigned long get_zeroed_page(gfp_t gfp_mask) -{ - unsigned long p = __get_free_pages(gfp_mask, 0); - - if (p) memset((void *)p, 0, PAGE_SIZE); - - return (unsigned long)p; -} - - -void free_hot_page(struct page *page) -{ - WARN_UNIMPL; -} - -/* - * XXX: If alloc_pages() gets fixed to allocate a page struct per page, - * this needs to be adapted, too. - */ -void __free_pages(struct page *page, unsigned int order) -{ - free_pages((unsigned long)page->virtual, order); - dde_page_cache_remove(page); -} - -void __pagevec_free(struct pagevec *pvec) -{ - WARN_UNIMPL; -} - -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) -{ - WARN_UNIMPL; - return 0; -} - -/** - * ... - * - * XXX order may be larger than allocation at 'addr' - it may comprise several - * allocation via __get_free_pages()! 
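
A minimal round trip through the DDE page cache above could look like this (illustrative only; it assumes the generic alloc_pages() wrapper resolves to the __alloc_pages_internal() shim above, that page->virtual is available in this configuration, that a prototype for dde_page_lookup() is visible, and that ddekit_large_malloc() hands back page-aligned memory, which the PAGE_MASK comparison in dde_page_lookup() relies on):

#include <linux/gfp.h>
#include <linux/mm.h>

static void my_page_cache_demo(void)
{
	struct page *p = alloc_pages(GFP_KERNEL, 0);	/* order-0 allocation        */
	void *va       = p->virtual;			/* backing virtual memory    */
	struct page *hit;

	/* dde_page_cache_add() hashed va into dde_page_cache[], so ...           */
	hit = dde_page_lookup((unsigned long)va);	/* ... hit == p here         */
	(void)hit;

	/* __pa(va) would resolve the physical address via the DDEKit page tables */

	__free_pages(p, 0);				/* also drops the cache entry */
}
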
- */ -void free_pages(unsigned long addr, unsigned int order) -{ - ddekit_log(DEBUG_PAGE_ALLOC, "addr=%p order=%d", (void *)addr, order); - - ddekit_large_free((void *)addr); -} - - -unsigned long __pa(volatile void *addr) -{ - return ddekit_pgtab_get_physaddr((void*)addr); -} - -void *__va(unsigned long addr) -{ - return (void*)ddekit_pgtab_get_virtaddr((ddekit_addr_t) addr); -} - - -int set_page_dirty_lock(struct page *page) -{ - WARN_UNIMPL; - return 0; -} - - -/* - * basically copied from linux/mm/page_alloc.c - */ -void *__init alloc_large_system_hash(const char *tablename, - unsigned long bucketsize, - unsigned long numentries, - int scale, - int flags, - unsigned int *_hash_shift, - unsigned int *_hash_mask, - unsigned long limit) -{ - void * table = NULL; - unsigned long log2qty; - unsigned long size; - - if (numentries == 0) - numentries = 1024; - - log2qty = ilog2(numentries); - size = bucketsize << log2qty; - - do { - unsigned long order; - for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++); - table = (void*) __get_free_pages(GFP_ATOMIC, order); - } while (!table && size > PAGE_SIZE && --log2qty); - - if (!table) - panic("Failed to allocate %s hash table\n", tablename); - - printk("%s hash table entries: %d (order: %d, %lu bytes)\n", - tablename, - (1U << log2qty), - ilog2(size) - PAGE_SHIFT, - size); - - if (_hash_shift) - *_hash_shift = log2qty; - if (_hash_mask) - *_hash_mask = (1 << log2qty) - 1; - - return table; -} - - -static void __init dde_page_cache_init(void) -{ - printk("Initializing DDE page cache\n"); - int i=0; - - for (i; i < DDE_PAGE_CACHE_SIZE; ++i) - INIT_HLIST_HEAD(&dde_page_cache[i]); -} - -core_initcall(dde_page_cache_init); diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/param.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/param.c.svn-base deleted file mode 100644 index 5bd83f32..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/param.c.svn-base +++ /dev/null @@ -1,32 +0,0 @@ -#include <linux/moduleparam.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/err.h> -#include <linux/slab.h> - -/* Lazy bastard, eh? 
*/ -#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ - int param_set_##name(const char *val, struct kernel_param *kp) \ - { \ - return 0; \ - } \ - int param_get_##name(char *buffer, struct kernel_param *kp) \ - { \ - return 0;\ - } - -STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); - -int printk_ratelimit(void) -{ - return 0; -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/pci.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/pci.c.svn-base deleted file mode 100644 index 2a0391f2..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/pci.c.svn-base +++ /dev/null @@ -1,189 +0,0 @@ -#include "local.h" - -#include <linux/delay.h> -#include <linux/pci.h> -#include <linux/list.h> -#include <linux/init.h> - -/* will include $(CONTRIB)/drivers/pci/pci.h */ -#include "pci.h" - -DECLARE_INITVAR(dde26_pci); - -/** PCI device descriptor */ -typedef struct l4dde_pci_dev { - struct list_head next; /**< chain info */ - struct ddekit_pci_dev *ddekit_dev; /**< corresponding DDEKit descriptor */ - struct pci_dev *linux_dev; /**< Linux descriptor */ -} l4dde_pci_dev_t; - - -/******************************************************************************************* - ** PCI data ** - *******************************************************************************************/ -/** List of Linux-DDEKit PCIDev mappings */ -static LIST_HEAD(pcidev_mappings); - -/** PCI bus */ -static struct pci_bus *pci_bus = NULL; - -static int l4dde26_pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); -static int l4dde26_pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); - -/** PCI operations for our virtual PCI bus */ -static struct pci_ops dde_pcibus_ops = { - .read = l4dde26_pci_read, - .write = l4dde26_pci_write, -}; - - -/******************************************************************************************* - ** Read/write PCI config space. This is simply mapped to the DDEKit functions. 
** - *******************************************************************************************/ -static int l4dde26_pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) -{ - return ddekit_pci_read(bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val); -} - -static int l4dde26_pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) -{ - return ddekit_pci_write(bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val); -} - -int pci_irq_enable(struct pci_dev *dev) -{ - int irq = dev->irq; - int pin = 0; - int ret; - - DEBUG_MSG("dev %p", dev); - if (!dev) - return -EINVAL; - - pin = (int)dev->pin; - DEBUG_MSG("irq %d, pin %d", dev->irq, dev->pin); - if (!pin) { - dev_warn(&dev->dev, - "No interrupt pin configured for device %s\n", - pci_name(dev)); - return 0; - } - pin--; - - ret = ddekit_pci_irq_enable(dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), pin, &irq); - if (ret) { - dev_warn(&dev->dev, "Interrupt enable failed for device %s (%d)\n", - pci_name(dev), ret); - return -1; - } - - dev_info(&dev->dev, "PCI INT %c -> GSI %d -> IRQ %d\n", - 'A' + pin, irq, dev->irq); - - dev->irq = irq; - return 0; -} - -int __pci_enable_device(struct pci_dev *dev) -{ - WARN_UNIMPL; - return 0; -} - - -/** - * pci_enable_device - Initialize device before it's used by a driver. - * - * Initialize device before it's used by a driver. Ask low-level code - * to enable I/O and memory. Wake up the device if it was suspended. - * Beware, this function can fail. - * - * \param dev PCI device to be initialized - * - */ -int -pci_enable_device(struct pci_dev *dev) -{ - CHECK_INITVAR(dde26_pci); -// WARN_UNIMPL; - return pci_irq_enable(dev); -} - - -/** - * pci_disable_device - Disable PCI device after use - * - * Signal to the system that the PCI device is not in use by the system - * anymore. This only involves disabling PCI bus-mastering, if active. - * - * \param dev PCI device to be disabled - */ -void pci_disable_device(struct pci_dev *dev) -{ - CHECK_INITVAR(dde26_pci); - WARN_UNIMPL; -} - - -void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) -{ - //WARN_UNIMPL; -} - -void pci_set_master(struct pci_dev *dev) -{ - CHECK_INITVAR(dde26_pci); - WARN_UNIMPL; -} - - -int pci_create_sysfs_dev_files(struct pci_dev *pdev) -{ - return 0; -} - -unsigned int pcibios_assign_all_busses(void) -{ - return 1; -} - -void -pcibios_align_resource(void *data, struct resource *res, - resource_size_t size, resource_size_t align) -{ - WARN_UNIMPL; -} - -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ -#if 0 - int err; - - if ((err = pcibios_enable_resources(dev, mask)) < 0) - return err; - - return pcibios_enable_irq(dev); -#endif - return 0; -} - -/******************************************************************************************* - ** Initialization function ** - *******************************************************************************************/ - -/** Initialize DDELinux PCI subsystem. 
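
Seen from a driver, the virtual PCI bus above behaves like an ordinary bus: config-space accessors and pci_enable_device() work as usual and end up in the DDEKit shims. A probe routine might look like this (sketch; my_probe and its device table are hypothetical):

#include <linux/pci.h>

static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	u16 vendor;
	int err;

	/* routed through dde_pcibus_ops.read -> ddekit_pci_read() */
	pci_read_config_word(pdev, PCI_VENDOR_ID, &vendor);

	/* in this environment this mainly wires up the IRQ via pci_irq_enable() */
	err = pci_enable_device(pdev);
	if (err)
		return err;

	return 0;
}
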
- */ -void __init l4dde26_init_pci(void) -{ - ddekit_pci_init(); - - pci_bus = pci_create_bus(NULL, 0, &dde_pcibus_ops, NULL); - Assert(pci_bus); - - pci_do_scan_bus(pci_bus); - - INITIALIZE_INITVAR(dde26_pci); -} - -arch_initcall(l4dde26_init_pci); diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/power.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/power.c.svn-base deleted file mode 100644 index e36487bd..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/power.c.svn-base +++ /dev/null @@ -1,23 +0,0 @@ -/* Dummy functions for power management. */ - -#include "local.h" -#include <linux/device.h> - -int device_pm_add(struct device * dev) -{ - WARN_UNIMPL; - return 0; -} - - -void device_pm_remove(struct device * dev) -{ - WARN_UNIMPL; -} - -int pm_qos_add_requirement(int qos, char *name, s32 value) { return 0; } -int pm_qos_update_requirement(int qos, char *name, s32 new_value) { return 0; } -void pm_qos_remove_requirement(int qos, char *name) { } -int pm_qos_requirement(int qos) { return 0; } -int pm_qos_add_notifier(int qos, struct notifier_block *notifier) { return 0; } -int pm_qos_remove_notifier(int qos, struct notifier_block *notifier) { return 0; } diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/process.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/process.c.svn-base deleted file mode 100644 index 5fe43b32..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/process.c.svn-base +++ /dev/null @@ -1,347 +0,0 @@ -#include <l4/dde/dde.h> -#include <l4/dde/linux26/dde26.h> - -#include <asm/atomic.h> - -#include <linux/init_task.h> -#include <linux/kernel.h> -#include <linux/kthread.h> -#include <linux/list.h> -#include <linux/thread_info.h> -#include <linux/sched.h> -#include <linux/pid.h> -#include <linux/vmalloc.h> - -#include "local.h" - -/***************************************************************************** - ** Current() implementation ** - *****************************************************************************/ -struct thread_info *current_thread_info(void) -{ - dde26_thread_data *cur = (dde26_thread_data *)ddekit_thread_get_my_data(); - return &LX_THREAD(cur); -} - -struct task_struct *get_current(void) -{ - return current_thread_info()->task; -} - -/***************************************************************************** - ** PID-related stuff ** - ** ** - ** Linux manages lists of PIDs that are handed out to processes so that at ** - ** a later point it is able to determine which task_struct belongs to a ** - ** certain PID. We implement this with a single list holding the mappings ** - ** for all our threads. ** - *****************************************************************************/ - -LIST_HEAD(_pid_task_list); -ddekit_lock_t _pid_task_list_lock; - -/** PID to task_struct mapping */ -struct pid2task -{ - struct list_head list; /**< list data */ - struct pid *pid; /**< PID */ - struct task_struct *ts; /**< task struct */ -}; - -struct pid init_struct_pid = INIT_STRUCT_PID; - -void put_pid(struct pid *pid) -{ - if (pid) - atomic_dec(&pid->count); - // no freeing here, our struct pid's are always allocated as - // part of the dde26_thread_data -} - -/** Attach PID to a certain task struct. 
*/ -void attach_pid(struct task_struct *task, enum pid_type type - __attribute__((unused)), struct pid *pid) -{ - /* Initialize a new pid2task mapping */ - struct pid2task *pt = kmalloc(sizeof(struct pid2task), GFP_KERNEL); - pt->pid = get_pid(pid); - pt->ts = task; - - /* add to list */ - ddekit_lock_lock(&_pid_task_list_lock); - list_add(&pt->list, &_pid_task_list); - ddekit_lock_unlock(&_pid_task_list_lock); -} - -/** Detach PID from a task struct. */ -void detach_pid(struct task_struct *task, enum pid_type type __attribute__((unused))) -{ - struct list_head *p, *n, *h; - - h = &_pid_task_list; - - ddekit_lock_lock(&_pid_task_list_lock); - /* search for mapping with given task struct and free it if necessary */ - list_for_each_safe(p, n, h) { - struct pid2task *pt = list_entry(p, struct pid2task, list); - if (pt->ts == task) { - put_pid(pt->pid); - list_del(p); - kfree(pt); - break; - } - } - ddekit_lock_unlock(&_pid_task_list_lock); -} - -struct task_struct *find_task_by_pid_type(int type, int nr) -{ - struct list_head *h, *p; - h = &_pid_task_list; - - ddekit_lock_lock(&_pid_task_list_lock); - list_for_each(p, h) { - struct pid2task *pt = list_entry(p, struct pid2task, list); - if (pid_nr(pt->pid) == nr) { - ddekit_lock_unlock(&_pid_task_list_lock); - return pt->ts; - } - } - ddekit_lock_unlock(&_pid_task_list_lock); - - return NULL; -} - - -struct task_struct *find_task_by_pid_ns(int nr, struct pid_namespace *ns) -{ - /* we don't implement PID name spaces */ - return find_task_by_pid_type(0, nr); -} - -struct task_struct *find_task_by_pid(int nr) -{ - return find_task_by_pid_type(0, nr); -} - -/***************************************************************************** - ** kernel_thread() implementation ** - *****************************************************************************/ -/* Struct containing thread data for a newly created kthread. */ -struct __kthread_data -{ - int (*fn)(void *); - void *arg; - ddekit_lock_t lock; - dde26_thread_data *kthread; -}; - -/** Counter for running kthreads. It is used to create unique names - * for kthreads. - */ -static atomic_t kthread_count = ATOMIC_INIT(0); - -/** Entry point for new kernel threads. Make this thread a DDE26 - * worker and then execute the real thread fn. - */ -static void __kthread_helper(void *arg) -{ - struct __kthread_data *k = (struct __kthread_data *)arg; - - /* - * Make a copy of the fn and arg pointers, as the kthread struct is - * deleted by our parent after notifying it and this may happen before we - * get to execute the function. - */ - int (*_fn)(void*) = k->fn; - void *_arg = k->arg; - - l4dde26_process_add_worker(); - - /* - * Handshake with creator - we store our thread data in the - * kthread struct and then unlock the lock to notify our - * creator about completing setup - */ - k->kthread = (dde26_thread_data *)ddekit_thread_get_my_data(); - ddekit_lock_unlock(&k->lock); - - do_exit(_fn(_arg)); -} - -/** Our implementation of Linux' kernel_thread() function. Setup a new - * thread running our __kthread_helper() function. 
- */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - ddekit_thread_t *t; - char name[20]; - struct __kthread_data *kt = vmalloc(sizeof(struct __kthread_data)); - ddekit_lock_t lock; - - /* Initialize (and grab) handshake lock */ - ddekit_lock_init(&lock); - ddekit_lock_lock(&lock); - - int threadnum = atomic_inc_return(&kthread_count); - kt->fn = fn; - kt->arg = arg; - kt->lock = lock; // Copy lock ptr, note that kt is freed by the - // new thread, so we MUST NOT use kt->lock after - // this point! - - snprintf(name, 20, ".kthread%x", threadnum); - t = ddekit_thread_create(__kthread_helper, - (void *)kt, name); - Assert(t); - - ddekit_lock_lock(&lock); - ddekit_lock_deinit(&lock); - - return pid_nr(VPID_P(kt->kthread)); -} - -/** Our implementation of exit(). For DDE purposes this only relates - * to kernel threads. - */ -void do_exit(long code) -{ - ddekit_thread_t *t = DDEKIT_THREAD(lxtask_to_ddethread(current)); -// printk("Thread %s exits with code %x\n", ddekit_thread_get_name(t), code); - - /* do some cleanup */ - detach_pid(current, 0); - - /* goodbye, cruel world... */ - ddekit_thread_exit(); -} - -/***************************************************************************** - ** Misc functions ** - *****************************************************************************/ - -void dump_stack(void) -{ -} - - -char *get_task_comm(char *buf, struct task_struct *tsk) -{ - char *ret; - /* buf must be at least sizeof(tsk->comm) in size */ - task_lock(tsk); - ret = strncpy(buf, tsk->comm, sizeof(tsk->comm)); - task_unlock(tsk); - return ret; -} - - -void set_task_comm(struct task_struct *tsk, char *buf) -{ - task_lock(tsk); - strlcpy(tsk->comm, buf, sizeof(tsk->comm)); - task_unlock(tsk); -} - - -/***************************************************************************** - ** DDEKit gluecode, init functions ** - *****************************************************************************/ -/* Initialize a dde26 thread. - * - * - Allocate thread data, as well as a Linux task struct, - * - Fill in default values for thread_info, and task, - * - Adapt task struct's thread_info backreference - * - Initialize the DDE sleep lock - */ -static dde26_thread_data *init_dde26_thread(void) -{ - /* - * Virtual PID counter - */ - static atomic_t pid_counter = ATOMIC_INIT(0); - dde26_thread_data *t = vmalloc(sizeof(dde26_thread_data)); - Assert(t); - - memcpy(&t->_vpid, &init_struct_pid, sizeof(struct pid)); - t->_vpid.numbers[0].nr = atomic_inc_return(&pid_counter); - - memcpy(&LX_THREAD(t), &init_thread, sizeof(struct thread_info)); - - LX_TASK(t) = vmalloc(sizeof(struct task_struct)); - Assert(LX_TASK(t)); - - memcpy(LX_TASK(t), &init_task, sizeof(struct task_struct)); - - /* nice: Linux backreferences a task`s thread_info from the - * task struct (which in turn can be found using the - * thread_info...) */ - LX_TASK(t)->stack = &LX_THREAD(t); - - /* initialize this thread's sleep lock */ - SLEEP_LOCK(t) = ddekit_sem_init(0); - - return t; -} - -/* Process setup for worker threads */ -int l4dde26_process_add_worker(void) -{ - dde26_thread_data *cur = init_dde26_thread(); - - /* If this function is called for a kernel_thread, the thread already has - * been set up and we just need to store a reference to the ddekit struct. - * However, this function may also be called directly to turn an L4 thread - * into a DDE thread. Then, we need to initialize here. 
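
A caller of the kernel_thread() implementation above follows the usual 2.6 pattern; the returned number is one of the virtual PIDs managed by the list code earlier in this file (sketch; my_thread_fn and the loop bound are hypothetical):

#include <linux/sched.h>
#include <linux/delay.h>

static int my_thread_fn(void *arg)
{
	int i;

	/* runs on a DDEKit thread that Linux code sees as a kernel thread */
	for (i = 0; i < 10; i++)
		msleep(1000);

	return 0;	/* __kthread_helper() feeds the return value into do_exit() */
}

static void my_spawn(void)
{
	int pid = kernel_thread(my_thread_fn, NULL, CLONE_FS | CLONE_FILES);
	struct task_struct *t = find_task_by_pid(pid);	/* virtual-PID lookup from above */

	(void)t;
}
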
*/ - cur->_ddekit_thread = ddekit_thread_myself(); - if (cur->_ddekit_thread == NULL) - cur->_ddekit_thread = ddekit_thread_setup_myself(".dde26_thread"); - Assert(cur->_ddekit_thread); - - ddekit_thread_set_my_data(cur); - - attach_pid(LX_TASK(cur), 0, &cur->_vpid); - - /* Linux' default is to have this set to 1 initially and let the - * scheduler set this to 0 later on. - */ - current_thread_info()->preempt_count = 0; - - return 0; -} - - -/** - * Add an already existing DDEKit thread to the set of threads known to the - * Linux environment. This is used for the timer thread, which is actually a - * DDEKit thread, but Linux code shall see it as a Linux thread as well. - */ -int l4dde26_process_from_ddekit(ddekit_thread_t *t) -{ - Assert(t); - - dde26_thread_data *cur = init_dde26_thread(); - cur->_ddekit_thread = t; - ddekit_thread_set_data(t, cur); - attach_pid(LX_TASK(cur), 0, &cur->_vpid); - - return 0; -} - -/** Function to initialize the first DDE process. - */ -int __init l4dde26_process_init(void) -{ - ddekit_lock_init_unlocked(&_pid_task_list_lock); - - int kthreadd_pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); - kthreadd_task = find_task_by_pid(kthreadd_pid); - - l4dde26_process_add_worker(); - - return 0; -} - -DEFINE_PER_CPU(int, cpu_number); - -//dde_process_initcall(l4dde26_process_init); diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/res.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/res.c.svn-base deleted file mode 100644 index fbd2d09b..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/res.c.svn-base +++ /dev/null @@ -1,180 +0,0 @@ -#include "local.h" - -#include <linux/ioport.h> - -/** Request an IO port region. - * - * \param start start port - * \param n number of ports - * \param name name of allocator (unused) - * - * \return NULL error - * \return !=NULL success - * - * \bug Since no one in Linux uses this function's return value, - * we do not allocate and fill a resource struct. - */ -static struct resource *l4dde26_request_region(resource_size_t start, - resource_size_t n, - const char *name) -{ - int err = ddekit_request_io(start, n); - - if (err) - return NULL; - - return (struct resource *)1; -} - - -/** List of memory regions that have been requested. This is used to - * perform ioremap() and iounmap() - */ -static LIST_HEAD(dde_mem_regions); - -/** va->pa mapping used to store memory regions */ -struct dde_mem_region { - ddekit_addr_t pa; - ddekit_addr_t va; - unsigned int size; - struct list_head list; -}; - -void __iomem * ioremap(unsigned long phys_addr, unsigned long size); - -/** Request an IO memory region. - * - * \param start start address - * \param n size of memory area - * \param name name of allocator (unused) - * - * \return NULL error - * \return !=NULL success - * - * \bug Since no one in Linux uses this function's return value, - * we do not allocate and fill a resource struct. 
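
On top of these helpers a driver uses the standard request/map/unmap sequence; request_mem_region() and release_mem_region() are the usual macros around __request_region()/__release_region() below (sketch; the region name and error handling are hypothetical):

#include <linux/ioport.h>
#include <linux/io.h>

static void __iomem *my_map_bar(unsigned long phys_start, unsigned long len)
{
	void __iomem *regs;

	if (!request_mem_region(phys_start, len, "my_dev"))	/* -> l4dde26_request_mem_region() */
		return NULL;

	regs = ioremap(phys_start, len);	/* hands back the DDEKit-provided virtual address */
	if (!regs)
		release_mem_region(phys_start, len);

	return regs;
}
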
- */ -static struct resource *l4dde26_request_mem_region(resource_size_t start, - resource_size_t n, - const char *name) -{ - ddekit_addr_t va = 0; - struct dde_mem_region *mreg; - - // do not a resource request twice - if (ioremap(start, n)) - return (struct resource *)1; - - int i = ddekit_request_mem(start, n, &va); - - if (i) { - ddekit_printf("request_mem_region() failed (start %lx, size %x)", start, n); - return NULL; - } - - mreg = kmalloc(sizeof(struct dde_mem_region), GFP_KERNEL); - Assert(mreg); - - mreg->pa = start; - mreg->va = va; - mreg->size = n; - list_add(&mreg->list, &dde_mem_regions); - -#if 0 - ddekit_pgtab_set_region_with_size((void *)va, start, n, PTE_TYPE_OTHER); -#endif - - return (struct resource *)1; -} - - -struct resource * __request_region(struct resource *parent, - resource_size_t start, - resource_size_t n, - const char *name, int flags) -{ - Assert(parent); - Assert(parent->flags & IORESOURCE_IO || parent->flags & IORESOURCE_MEM); - - switch (parent->flags) - { - case IORESOURCE_IO: - return l4dde26_request_region(start, n, name); - case IORESOURCE_MEM: - return l4dde26_request_mem_region(start, n, name); - } - - return NULL; -} - - -/** Release IO port region. - */ -static void l4dde26_release_region(resource_size_t start, resource_size_t n) -{ - /* FIXME: we need a list of "struct resource"s that have been - * allocated by request_region() and then need to - * free this stuff here! */ - ddekit_release_io(start, n); -} - - -/** Release IO memory region. - */ -static void l4dde26_release_mem_region(resource_size_t start, resource_size_t n) -{ - ddekit_release_mem(start, n); - ddekit_pgtab_clear_region((void *)start, PTE_TYPE_OTHER); -} - - -int __check_region(struct resource *root, resource_size_t s, resource_size_t n) -{ - WARN_UNIMPL; - return -1; -} - -void __release_region(struct resource *root, resource_size_t start, - resource_size_t n) -{ - switch (root->flags) - { - case IORESOURCE_IO: - return l4dde26_release_region(start, n); - case IORESOURCE_MEM: - return l4dde26_release_mem_region(start, n); - } -} - - -/** Map physical I/O region into virtual address space. - * - * For our sake, this only returns the virtual address belonging to - * the physical region, since we don't manage page tables ourselves. - */ -void __iomem * ioremap(unsigned long phys_addr, unsigned long size) -{ - struct list_head *pos, *head; - head = &dde_mem_regions; - - list_for_each(pos, head) { - struct dde_mem_region *mreg = list_entry(pos, struct dde_mem_region, - list); - if (mreg->pa <= phys_addr && mreg->pa + mreg->size >= phys_addr + size) - return (void *)(mreg->va + (phys_addr - mreg->pa)); - } - - return NULL; -} - - -void __iomem * ioremap_nocache(unsigned long offset, unsigned long size) -{ - return ioremap(offset, size); -} - - -void iounmap(volatile void __iomem *addr) -{ - WARN_UNIMPL; -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/sched.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/sched.c.svn-base deleted file mode 100644 index b38520c6..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/sched.c.svn-base +++ /dev/null @@ -1,155 +0,0 @@ -#include "local.h" - -#include <linux/sched.h> - -DEFINE_RWLOCK(tasklist_lock); - -asmlinkage void preempt_schedule(void) -{ - WARN_UNIMPL; -} - - -/* Our version of scheduler invocation. - * - * Scheduling is performed by Fiasco, so we don't care about it as long as - * a thread is running. 
If a task becomes TASK_INTERRUPTIBLE or - * TASK_UNINTERRUPTIBLE, we make sure that the task does not become - * scheduled by locking the task's sleep lock. - */ -asmlinkage void schedule(void) -{ - dde26_thread_data *t = lxtask_to_ddethread(current); - - switch (current->state) { - case TASK_RUNNING: - ddekit_thread_schedule(); - break; - case TASK_INTERRUPTIBLE: - case TASK_UNINTERRUPTIBLE: - ddekit_sem_down(SLEEP_LOCK(t)); - break; - default: - panic("current->state = %d --- unknown state\n", current->state); - } -} - - -/** yield the current processor to other threads. - * - * this is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - ddekit_yield(); -} - - -/*** - * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread - * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? - */ -int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) -{ - Assert(p); - dde26_thread_data *t = lxtask_to_ddethread(p); - - Assert(t); - Assert(SLEEP_LOCK(t)); - - p->state = TASK_RUNNING; - ddekit_sem_up(SLEEP_LOCK(t)); - - return 0; -} - - -static void process_timeout(unsigned long data) -{ - wake_up_process((struct task_struct *)data); -} - - -signed long __sched schedule_timeout(signed long timeout) -{ - struct timer_list timer; - unsigned long expire = timeout + jiffies; - - setup_timer(&timer, process_timeout, (unsigned long)current); - timer.expires = expire; - - switch(timeout) - { - /* - * Hah! - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - */ - case MAX_SCHEDULE_TIMEOUT: - schedule(); - break; - default: - add_timer(&timer); - schedule(); - del_timer(&timer); - break; - } - - timeout = expire - jiffies; - - return timeout < 0 ? 0 : timeout; -} - - -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} - - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} - -/** Tasks may be forced to run only on a certain no. of CPUs. Since - * we only emulate a SMP-environment for the sake of having multiple - * threads, we do not need to implement this. - */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - return 0; -} - -void set_user_nice(struct task_struct *p, long nice) -{ - //WARN_UNIMPL; -} - -void __sched io_schedule(void) -{ - WARN_UNIMPL; -} - -long __sched io_schedule_timeout(long timeout) -{ - WARN_UNIMPL; - return -1; -} - -extern int sched_setscheduler_nocheck(struct task_struct *t, int flags, - struct sched_param *p) -{ - WARN_UNIMPL; - return -1; -} - -void ignore_signals(struct task_struct *t) { } diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/signal.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/signal.c.svn-base deleted file mode 100644 index bd0bc0a7..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/signal.c.svn-base +++ /dev/null @@ -1,24 +0,0 @@ -#include "local.h" - -/****************************************************************************** - ** Dummy signal implementation. ** - ** DDE does not provide its own signal implementation. 
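
The sleep/wakeup model implemented in sched.c above is driven by the standard Linux idiom: a waiter parks itself via schedule() on its DDE sleep lock, and a waker releases it through try_to_wake_up(). A sketch (condition handling and locking are the caller's business and kept hypothetical here):

#include <linux/sched.h>

static int my_condition;
static struct task_struct *my_waiter;

static void my_wait(void)
{
	my_waiter = current;
	set_current_state(TASK_INTERRUPTIBLE);
	while (!my_condition) {
		schedule();			/* ddekit_sem_down() on this thread's sleep lock */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	set_current_state(TASK_RUNNING);

	/* a bounded variant would use schedule_timeout_interruptible(2 * HZ) */
}

static void my_wake(void)
{
	my_condition = 1;
	wake_up_process(my_waiter);		/* -> try_to_wake_up() -> ddekit_sem_up() */
}
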
To make it compile, ** - ** we provide dummy versions of signalling functions here. If later on ** - ** someone *REALLY* wants to use signals in the DDE context, he might ** - ** erase this file and use something like the L4 signalling library for ** - ** such purposes. ** -*******************************************************************************/ - -int sigprocmask(int how, sigset_t *set, sigset_t *oldset) -{ - return 0; -} - -void flush_signals(struct task_struct *t) -{ -} - -int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) -{ - return 0; -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/smp.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/smp.c.svn-base deleted file mode 100644 index 1ebf08c2..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/smp.c.svn-base +++ /dev/null @@ -1,37 +0,0 @@ -#include <linux/cpumask.h> - -#include "local.h" - -static struct cpumask _possible = CPU_MASK_ALL; -static struct cpumask _online = CPU_MASK_CPU0; -static struct cpumask _present = CPU_MASK_CPU0; -static struct cpumask _active = CPU_MASK_CPU0; - -const struct cpumask *const cpu_possible_mask = &_possible; -const struct cpumask *const cpu_online_mask = &_online; -const struct cpumask *const cpu_present_mask = &_present; -const struct cpumask *const cpu_active_mask = &_active; - -cpumask_t cpu_mask_all = CPU_MASK_ALL; -int nr_cpu_ids = NR_CPUS; -const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); - -/* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) -#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) -#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) -#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) - -const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { - MASK_DECLARE_8(0), MASK_DECLARE_8(8), - MASK_DECLARE_8(16), MASK_DECLARE_8(24), -#if BITS_PER_LONG > 32 - MASK_DECLARE_8(32), MASK_DECLARE_8(40), - MASK_DECLARE_8(48), MASK_DECLARE_8(56), -#endif -}; - -void __smp_call_function_single(int cpuid, struct call_single_data *data) -{ - data->func(data->info); -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/softirq.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/softirq.c.svn-base deleted file mode 100644 index 21b36d17..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/softirq.c.svn-base +++ /dev/null @@ -1,267 +0,0 @@ -#include "local.h" - -#include <linux/interrupt.h> - -/* There are at most 32 softirqs in Linux, but only 6 are really used. */ -#define NUM_SOFTIRQS 6 - -DECLARE_INITVAR(dde26_softirq); - -/* softirq threads and their wakeup semaphores */ -ddekit_thread_t *dde_softirq_thread; -ddekit_sem_t *dde_softirq_sem; - -/* struct tasklet_head is not defined in a header in Linux 2.6 */ -struct tasklet_head -{ - struct tasklet_struct *list; - ddekit_lock_t lock; /* list lock */ -}; - -/* What to do if a softirq occurs. 
*/ -static struct softirq_action softirq_vec[32]; - -/* tasklet queues for each softirq thread */ -struct tasklet_head tasklet_vec; -struct tasklet_head tasklet_hi_vec; - -void open_softirq(int nr, void (*action)(struct softirq_action*)) -{ - softirq_vec[nr].action = action; -} - -static void raise_softirq_irqoff_cpu(unsigned int nr, unsigned int cpu) -{ - CHECK_INITVAR(dde26_softirq); - - /* mark softirq scheduled */ - __raise_softirq_irqoff(nr); - /* wake softirq thread */ - ddekit_sem_up(dde_softirq_sem); -} - -void raise_softirq_irqoff(unsigned int nr) -{ - raise_softirq_irqoff_cpu(nr, 0); -} - -void raise_softirq(unsigned int nr) -{ - unsigned long flags; - - local_irq_save(flags); - raise_softirq_irqoff(nr); - local_irq_restore(flags); -} - -/** - * Initialize tasklet. - */ -void tasklet_init(struct tasklet_struct *t, - void (*func)(unsigned long), unsigned long data) -{ - t->next = NULL; - t->state = 0; - atomic_set(&t->count, 0); - t->func = func; - t->data = data; -} - -/* enqueue tasklet */ -static void __tasklet_enqueue(struct tasklet_struct *t, - struct tasklet_head *listhead) -{ - ddekit_lock_lock(&listhead->lock); - t->next = listhead->list; - listhead->list = t; - ddekit_lock_unlock(&listhead->lock); -} - -void __tasklet_schedule(struct tasklet_struct *t) -{ - unsigned long flags; - - CHECK_INITVAR(dde26_softirq); - - local_irq_save(flags); - - __tasklet_enqueue(t, &tasklet_vec); - /* raise softirq */ - raise_softirq_irqoff_cpu(TASKLET_SOFTIRQ, 0); - - local_irq_restore(flags); -} - -void __tasklet_hi_schedule(struct tasklet_struct *t) -{ - unsigned long flags; - - CHECK_INITVAR(dde26_softirq); - - local_irq_save(flags); - __tasklet_enqueue(t, &tasklet_hi_vec); - raise_softirq_irqoff_cpu(HI_SOFTIRQ, 0); - local_irq_restore(flags); -} - -/* Execute tasklets */ -static void tasklet_action(struct softirq_action *a) -{ - struct tasklet_struct *list; - - ddekit_lock_lock(&tasklet_vec.lock); - list = tasklet_vec.list; - tasklet_vec.list = NULL; - ddekit_lock_unlock(&tasklet_vec.lock); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - ddekit_lock_lock(&tasklet_vec.lock); - t->next = tasklet_vec.list; - tasklet_vec.list = t; - raise_softirq_irqoff_cpu(TASKLET_SOFTIRQ, 0); - ddekit_lock_unlock(&tasklet_vec.lock); - } -} - - -static void tasklet_hi_action(struct softirq_action *a) -{ - struct tasklet_struct *list; - - ddekit_lock_lock(&tasklet_hi_vec.lock); - list = tasklet_hi_vec.list; - tasklet_hi_vec.list = NULL; - ddekit_lock_unlock(&tasklet_hi_vec.lock); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - ddekit_lock_lock(&tasklet_hi_vec.lock); - t->next = tasklet_hi_vec.list; - tasklet_hi_vec.list = t; - raise_softirq_irqoff_cpu(HI_SOFTIRQ, 0); - ddekit_lock_unlock(&tasklet_hi_vec.lock); - } -} - - -#define MAX_SOFTIRQ_RETRIES 10 - -/** Run softirq handlers - */ -void __do_softirq(void) -{ - int retries = MAX_SOFTIRQ_RETRIES; - do { - struct softirq_action *h = softirq_vec; - unsigned long pending = local_softirq_pending(); - - /* reset softirq count */ - set_softirq_pending(0); - - /* While we have a 
softirq pending... */ - while (pending) { - /* need to execute current softirq? */ - if (pending & 1) - h->action(h); - /* try next softirq */ - h++; - /* remove pending flag for last softirq */ - pending >>= 1; - } - - /* Somebody might have scheduled another softirq in between - * (e.g., an IRQ thread or another tasklet). */ - } while (local_softirq_pending() && --retries); - -} - - -void do_softirq(void) -{ - unsigned long flags; - - local_irq_save(flags); - if (local_softirq_pending()) - __do_softirq(); - local_irq_restore(flags); -} - -/** Softirq thread function. - * - * Once started, a softirq thread waits for tasklets to be scheduled - * and executes them. - * - * \param arg # of this softirq thread so that it grabs the correct lock - * if multiple softirq threads are running. - */ -void l4dde26_softirq_thread(void *arg) -{ - printk("Softirq daemon starting\n"); - l4dde26_process_add_worker(); - - /* This thread will always be in a softirq, so set the - * corresponding flag right now. - */ - preempt_count() |= SOFTIRQ_MASK; - - while(1) { - ddekit_sem_down(dde_softirq_sem); - do_softirq(); - } -} - -/** Initialize softirq subsystem. - * - * Start NUM_SOFTIRQ_THREADS threads executing the \ref l4dde26_softirq_thread - * function. - */ -void l4dde26_softirq_init(void) -{ - char name[20]; - - dde_softirq_sem = ddekit_sem_init(0); - - set_softirq_pending(0); - - ddekit_lock_init_unlocked(&tasklet_vec.lock); - ddekit_lock_init_unlocked(&tasklet_hi_vec.lock); - - snprintf(name, 20, ".softirqd"); - dde_softirq_thread = ddekit_thread_create( - l4dde26_softirq_thread, - NULL, name); - - open_softirq(TASKLET_SOFTIRQ, tasklet_action); - open_softirq(HI_SOFTIRQ, tasklet_hi_action); - - INITIALIZE_INITVAR(dde26_softirq); -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/timer.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/timer.c.svn-base deleted file mode 100644 index ea04b67e..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/timer.c.svn-base +++ /dev/null @@ -1,184 +0,0 @@ -#include "local.h" - -#include <linux/timer.h> -#include <linux/fs.h> -#include <asm/delay.h> - -DECLARE_INITVAR(dde26_timer); - -/* Definitions from linux/kernel/timer.c */ - -/* - * per-CPU timer vector definitions: - */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -typedef struct tvec_s { - struct list_head vec[TVN_SIZE]; -} tvec_t; - -typedef struct tvec_root_s { - struct list_head vec[TVR_SIZE]; -} tvec_root_t; - -struct tvec_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long timer_jiffies; - tvec_root_t tv1; - tvec_t tv2; - tvec_t tv3; - tvec_t tv4; - tvec_t tv5; -} ____cacheline_aligned_in_smp; - -typedef struct tvec_t_base_s tvec_base_t; - -struct tvec_base boot_tvec_bases __attribute__((unused)); - -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) __attribute__((unused)) = &boot_tvec_bases; - -void init_timer(struct timer_list *timer) -{ - timer->ddekit_timer_id = DDEKIT_INVALID_TIMER_ID; -} - -void add_timer(struct timer_list *timer) -{ - CHECK_INITVAR(dde26_timer); - /* DDE2.6 uses jiffies and HZ as exported from L4IO. Therefore - * we just need to hand over the timeout to DDEKit. 
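
Client code defers work into the softirq thread above through the ordinary tasklet interface (sketch; the handler, its data word and the interrupt handler are hypothetical):

#include <linux/interrupt.h>

static void my_deferred_work(unsigned long data)
{
	/* executed by the ".softirqd" DDEKit thread via tasklet_action() */
}

static DECLARE_TASKLET(my_tasklet, my_deferred_work, 0);

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	tasklet_schedule(&my_tasklet);	/* -> __tasklet_schedule() -> TASKLET_SOFTIRQ raised */
	return IRQ_HANDLED;
}
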
*/ - timer->ddekit_timer_id = ddekit_add_timer((void *)timer->function, - (void *)timer->data, - timer->expires); -} - - -void add_timer_on(struct timer_list *timer, int cpu) -{ - add_timer(timer); -} - - -int del_timer(struct timer_list * timer) -{ - int ret; - CHECK_INITVAR(dde26_timer); - ret = ddekit_del_timer(timer->ddekit_timer_id); - timer->ddekit_timer_id = DDEKIT_INVALID_TIMER_ID; - - return ret >= 0; -} - -int del_timer_sync(struct timer_list *timer) -{ - return del_timer(timer); -} - - -int __mod_timer(struct timer_list *timer, unsigned long expires) -{ - /* XXX: Naive implementation. If we really need to be fast with - * this function, we can implement a faster version inside - * the DDEKit. Bjoern just does not think that this is the - * case. - */ - int r; - - CHECK_INITVAR(dde26_timer); - r = del_timer(timer); - - timer->expires = expires; - add_timer(timer); - - return (r > 0); -} - - -int mod_timer(struct timer_list *timer, unsigned long expires) -{ - return __mod_timer(timer, expires); -} - - -int timer_pending(const struct timer_list *timer) -{ - CHECK_INITVAR(dde26_timer); - /* There must be a valid DDEKit timer ID in the timer field - * *AND* it must be pending in the DDEKit. - */ - return ((timer->ddekit_timer_id != DDEKIT_INVALID_TIMER_ID) - && ddekit_timer_pending(timer->ddekit_timer_id)); -} - - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - ddekit_thread_msleep(msecs); -} - - -void __const_udelay(unsigned long xloops) -{ - ddekit_thread_usleep(xloops); -} - - -void __udelay(unsigned long usecs) -{ - ddekit_thread_usleep(usecs); -} - - -void __ndelay(unsigned long nsecs) -{ - ddekit_thread_nsleep(nsecs); -} - - -void __init l4dde26_init_timers(void) -{ - ddekit_init_timers(); - - l4dde26_process_from_ddekit(ddekit_get_timer_thread()); - - INITIALIZE_INITVAR(dde26_timer); -} - -core_initcall(l4dde26_init_timers); - -extern unsigned long volatile __jiffy_data jiffies; - -__attribute__((weak)) void do_gettimeofday (struct timeval *tv) -{ - WARN_UNIMPL; -} - -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = {0,0}; - WARN_UNIMPL; - return now; -} - -ktime_t ktime_get_real(void) -{ - struct timespec now = {0,0}; - WARN_UNIMPL; - return timespec_to_ktime(now); -} - - -void native_io_delay(void) -{ - udelay(2); -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/vmalloc.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/vmalloc.c.svn-base deleted file mode 100644 index 134b80c3..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/vmalloc.c.svn-base +++ /dev/null @@ -1,30 +0,0 @@ -/****************************************************************************** - * Bjoern Doebel <doebel@tudos.org> * - * * - * (c) 2005 - 2007 Technische Universitaet Dresden * - * This file is part of DROPS, which is distributed under the terms of the * - * GNU General Public License 2. Please see the COPYING file for details. 
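
The timer wrappers above keep the normal 2.6 API, so callers never see the DDEKit timer underneath (sketch; callback and interval are hypothetical):

#include <linux/timer.h>
#include <linux/jiffies.h>

static void my_timeout(unsigned long data)
{
	/* fires on the DDEKit timer thread registered by l4dde26_init_timers() */
}

static struct timer_list my_timer;

static void my_arm_timer(void)
{
	setup_timer(&my_timer, my_timeout, 0);
	mod_timer(&my_timer, jiffies + HZ);	/* one second; ends up in ddekit_add_timer() */
}

static void my_cancel_timer(void)
{
	del_timer_sync(&my_timer);		/* -> ddekit_del_timer() */
}
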
* - ******************************************************************************/ - -/* - * \brief vmalloc implementation - * \author Bjoern Doebel - * \date 2007-07-30 - */ - -/* Linux */ -#include <linux/vmalloc.h> - -/* DDEKit */ -#include <l4/dde/ddekit/memory.h> -#include <l4/dde/ddekit/lock.h> - -void *vmalloc(unsigned long size) -{ - return ddekit_simple_malloc(size); -} - -void vfree(const void *addr) -{ - ddekit_simple_free((void*)addr); -} diff --git a/libdde_linux26/lib/src/arch/l4/.svn/text-base/vmstat.c.svn-base b/libdde_linux26/lib/src/arch/l4/.svn/text-base/vmstat.c.svn-base deleted file mode 100644 index 2e87389e..00000000 --- a/libdde_linux26/lib/src/arch/l4/.svn/text-base/vmstat.c.svn-base +++ /dev/null @@ -1,34 +0,0 @@ -#include "local.h" - -#include <linux/fs.h> - -atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - - -void dec_zone_page_state(struct page *page, enum zone_stat_item item) -{ - WARN_UNIMPL; -} - - -void inc_zone_page_state(struct page *page, enum zone_stat_item item) -{ - WARN_UNIMPL; -} - - -void __inc_zone_page_state(struct page *page, enum zone_stat_item item) -{ - WARN_UNIMPL; -} - -void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) -{ - WARN_UNIMPL; -} - -void __dec_zone_state(struct zone *zone, enum zone_stat_item item) -{ - WARN_UNIMPL; -} diff --git a/libdde_linux26/lib/src/arch/x86/.svn/all-wcprops b/libdde_linux26/lib/src/arch/x86/.svn/all-wcprops deleted file mode 100644 index 2db9a887..00000000 --- a/libdde_linux26/lib/src/arch/x86/.svn/all-wcprops +++ /dev/null @@ -1,5 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 67 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/x86 -END diff --git a/libdde_linux26/lib/src/arch/x86/.svn/entries b/libdde_linux26/lib/src/arch/x86/.svn/entries deleted file mode 100644 index cdbe1e1d..00000000 --- a/libdde_linux26/lib/src/arch/x86/.svn/entries +++ /dev/null @@ -1,31 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/arch/x86 -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -lib -dir - diff --git a/libdde_linux26/lib/src/arch/x86/.svn/format b/libdde_linux26/lib/src/arch/x86/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/arch/x86/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/arch/x86/lib/.svn/all-wcprops b/libdde_linux26/lib/src/arch/x86/lib/.svn/all-wcprops deleted file mode 100644 index 61d9e4b5..00000000 --- a/libdde_linux26/lib/src/arch/x86/lib/.svn/all-wcprops +++ /dev/null @@ -1,11 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 71 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/x86/lib -END -semaphore_32.S -K 25 -svn:wc:ra_dav:version-url -V 86 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/arch/x86/lib/semaphore_32.S -END diff --git a/libdde_linux26/lib/src/arch/x86/lib/.svn/entries b/libdde_linux26/lib/src/arch/x86/lib/.svn/entries deleted file mode 100644 index ee8219b2..00000000 --- a/libdde_linux26/lib/src/arch/x86/lib/.svn/entries +++ /dev/null @@ -1,62 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/arch/x86/lib -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb 
- -semaphore_32.S -file - - - - -2009-11-15T17:17:12.000000Z -8781a421c002516577c2888bc85b51e9 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -2859 - diff --git a/libdde_linux26/lib/src/arch/x86/lib/.svn/format b/libdde_linux26/lib/src/arch/x86/lib/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/arch/x86/lib/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/arch/x86/lib/.svn/text-base/semaphore_32.S.svn-base b/libdde_linux26/lib/src/arch/x86/lib/.svn/text-base/semaphore_32.S.svn-base deleted file mode 100644 index 1850ca50..00000000 --- a/libdde_linux26/lib/src/arch/x86/lib/.svn/text-base/semaphore_32.S.svn-base +++ /dev/null @@ -1,138 +0,0 @@ -/* - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> - */ - -#include <linux/linkage.h> -#include <asm/rwlock.h> -#include <asm/alternative-asm.h> -#include <asm/frame.h> -#include <asm/dwarf2.h> - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. 
- */ -#ifndef DDE_LINUX - .section .sched.text, "ax" -#endif - -/* - * rw spinlock fallbacks - */ -#ifdef CONFIG_SMP -ENTRY(__write_lock_failed) - CFI_STARTPROC simple - FRAME -2: LOCK_PREFIX - addl $ RW_LOCK_BIAS,(%eax) -1: rep; nop - cmpl $ RW_LOCK_BIAS,(%eax) - jne 1b - LOCK_PREFIX - subl $ RW_LOCK_BIAS,(%eax) - jnz 2b - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__write_lock_failed) - -ENTRY(__read_lock_failed) - CFI_STARTPROC - FRAME -2: LOCK_PREFIX - incl (%eax) -1: rep; nop - cmpl $1,(%eax) - js 1b - LOCK_PREFIX - decl (%eax) - js 2b - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__read_lock_failed) - -#endif - -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - call rwsem_down_read_failed - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 - ret - CFI_ENDPROC - ENDPROC(call_rwsem_down_read_failed) - -ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - calll rwsem_down_write_failed - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 - ret - CFI_ENDPROC - ENDPROC(call_rwsem_down_write_failed) - -ENTRY(call_rwsem_wake) - CFI_STARTPROC - decw %dx /* do nothing if still outstanding active readers */ - jnz 1f - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call rwsem_wake - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 -1: ret - CFI_ENDPROC - ENDPROC(call_rwsem_wake) - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC - push %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - push %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - call rwsem_downgrade_wake - pop %edx - CFI_ADJUST_CFA_OFFSET -4 - pop %ecx - CFI_ADJUST_CFA_OFFSET -4 - ret - CFI_ENDPROC - ENDPROC(call_rwsem_downgrade_wake) - -#endif diff --git a/libdde_linux26/lib/src/block/.svn/all-wcprops b/libdde_linux26/lib/src/block/.svn/all-wcprops deleted file mode 100644 index e426ee37..00000000 --- a/libdde_linux26/lib/src/block/.svn/all-wcprops +++ /dev/null @@ -1,23 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 64 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/block -END -blk-core.c -K 25 -svn:wc:ra_dav:version-url -V 75 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/block/blk-core.c -END -genhd.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/block/genhd.c -END -blk.h -K 25 -svn:wc:ra_dav:version-url -V 70 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/block/blk.h -END diff --git a/libdde_linux26/lib/src/block/.svn/entries b/libdde_linux26/lib/src/block/.svn/entries deleted file mode 100644 index ecaca048..00000000 --- a/libdde_linux26/lib/src/block/.svn/entries +++ /dev/null @@ -1,130 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/block -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -blk-core.c -file - - - - -2009-11-15T17:17:12.000000Z -d0942aa32e112472f0be78922e474dd6 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -59403 - -genhd.c -file - - - - -2009-11-15T17:17:12.000000Z -38a5b5fd16bffbc4ffd2401ed87c6e94 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -29312 
- -blk.h -file - - - - -2009-11-15T17:17:12.000000Z -112454538cc2f824e270da8ef24e3e0b -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -3294 - diff --git a/libdde_linux26/lib/src/block/.svn/format b/libdde_linux26/lib/src/block/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/block/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/block/.svn/text-base/blk-core.c.svn-base b/libdde_linux26/lib/src/block/.svn/text-base/blk-core.c.svn-base deleted file mode 100644 index 92241e50..00000000 --- a/libdde_linux26/lib/src/block/.svn/text-base/blk-core.c.svn-base +++ /dev/null @@ -1,2173 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 1994, Karl Keyte: Added support for disk statistics - * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> - * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - * - July2000 - * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 - */ - -/* - * This handles all read/write requests to block devices - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/backing-dev.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/highmem.h> -#include <linux/mm.h> -#include <linux/kernel_stat.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/completion.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/blktrace_api.h> -#include <linux/fault-inject.h> -#include <trace/block.h> - -#include "blk.h" - -DEFINE_TRACE(block_plug); -DEFINE_TRACE(block_unplug_io); -DEFINE_TRACE(block_unplug_timer); -DEFINE_TRACE(block_getrq); -DEFINE_TRACE(block_sleeprq); -DEFINE_TRACE(block_rq_requeue); -DEFINE_TRACE(block_bio_backmerge); -DEFINE_TRACE(block_bio_frontmerge); -DEFINE_TRACE(block_bio_queue); -DEFINE_TRACE(block_rq_complete); -DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); - -static int __make_request(struct request_queue *q, struct bio *bio); - -/* - * For the allocated request tables - */ -static struct kmem_cache *request_cachep; - -/* - * For queue allocation - */ -struct kmem_cache *blk_requestq_cachep; - -/* - * Controlling structure to kblockd - */ -static struct workqueue_struct *kblockd_workqueue; - -static void drive_stat_acct(struct request *rq, int new_io) -{ - struct gendisk *disk = rq->rq_disk; - struct hd_struct *part; - int rw = rq_data_dir(rq); - int cpu; - - if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue)) - return; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, rq->sector); - - if (!new_io) - part_stat_inc(cpu, part, merges[rw]); - else { - part_round_stats(cpu, part); - part_inc_in_flight(part); - } - - part_stat_unlock(); -} - -void blk_queue_congestion_threshold(struct request_queue *q) -{ - int nr; - - nr = q->nr_requests - (q->nr_requests / 8) + 1; - if (nr > q->nr_requests) - nr = q->nr_requests; - q->nr_congestion_on = nr; - - nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; - if (nr < 1) - nr = 1; - q->nr_congestion_off = nr; -} - -/** - * blk_get_backing_dev_info - get the address of a queue's backing_dev_info - * @bdev: device - * - * Locates the passed device's request queue and returns the address of its - * backing_dev_info - * - * 
Will return NULL if the request queue cannot be located. - */ -struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) -{ - struct backing_dev_info *ret = NULL; - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - ret = &q->backing_dev_info; - return ret; -} -EXPORT_SYMBOL(blk_get_backing_dev_info); - -void blk_rq_init(struct request_queue *q, struct request *rq) -{ - memset(rq, 0, sizeof(*rq)); - - INIT_LIST_HEAD(&rq->queuelist); - INIT_LIST_HEAD(&rq->timeout_list); - rq->cpu = -1; - rq->q = q; - rq->sector = rq->hard_sector = (sector_t) -1; - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); - rq->cmd = rq->__cmd; - rq->tag = -1; - rq->ref_count = 1; -} -EXPORT_SYMBOL(blk_rq_init); - -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) -{ - struct request_queue *q = rq->q; - - if (&q->bar_rq != rq) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (unlikely(nbytes > bio->bi_size)) { - printk(KERN_ERR "%s: want %u bytes done, %u left\n", - __func__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } - - if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); - - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); - - if (bio_integrity(bio)) - bio_integrity_advance(bio, nbytes); - - if (bio->bi_size == 0) - bio_endio(bio, error); - } else { - - /* - * Okay, this is the barrier request in progress, just - * record the error; - */ - if (error && !q->orderr) - q->orderr = error; - } -} - -void blk_dump_rq_flags(struct request *rq, char *msg) -{ - int bit; - - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); - - printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", - (unsigned long long)rq->sector, - rq->nr_sectors, - rq->current_nr_sectors); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", - rq->bio, rq->biotail, - rq->buffer, rq->data, - rq->data_len); - - if (blk_pc_request(rq)) { - printk(KERN_INFO " cdb: "); - for (bit = 0; bit < BLK_MAX_CDB; bit++) - printk("%02x ", rq->cmd[bit]); - printk("\n"); - } -} -EXPORT_SYMBOL(blk_dump_rq_flags); - -/* - * "plug" the device if there are no outstanding requests: this will - * force the transfer to start only after we have put all the requests - * on the list. - * - * This is called with interrupts off and no requests on the queue and - * with the queue lock held. - */ -void blk_plug_device(struct request_queue *q) -{ - WARN_ON(!irqs_disabled()); - - /* - * don't plug a stopped queue, it must be paired with blk_start_queue() - * which will restart the queueing - */ - if (blk_queue_stopped(q)) - return; - - if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { - mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - trace_block_plug(q); - } -} -EXPORT_SYMBOL(blk_plug_device); - -/** - * blk_plug_device_unlocked - plug a device without queue lock held - * @q: The &struct request_queue to plug - * - * Description: - * Like @blk_plug_device(), but grabs the queue lock and disables - * interrupts. - **/ -void blk_plug_device_unlocked(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_plug_device(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_plug_device_unlocked); - -/* - * remove the queue from the plugged list, if present. 
called with - * queue lock held and interrupts disabled. - */ -int blk_remove_plug(struct request_queue *q) -{ - WARN_ON(!irqs_disabled()); - - if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) - return 0; - - del_timer(&q->unplug_timer); - return 1; -} -EXPORT_SYMBOL(blk_remove_plug); - -/* - * remove the plug and let it rip.. - */ -void __generic_unplug_device(struct request_queue *q) -{ - if (unlikely(blk_queue_stopped(q))) - return; - if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) - return; - - q->request_fn(q); -} - -/** - * generic_unplug_device - fire a request queue - * @q: The &struct request_queue in question - * - * Description: - * Linux uses plugging to build bigger requests queues before letting - * the device have at them. If a queue is plugged, the I/O scheduler - * is still adding and merging requests on the queue. Once the queue - * gets unplugged, the request_fn defined for the queue is invoked and - * transfers started. - **/ -void generic_unplug_device(struct request_queue *q) -{ - if (blk_queue_plugged(q)) { - spin_lock_irq(q->queue_lock); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - } -} -EXPORT_SYMBOL(generic_unplug_device); - -static void blk_backing_dev_unplug(struct backing_dev_info *bdi, - struct page *page) -{ - struct request_queue *q = bdi->unplug_io_data; - - blk_unplug(q); -} - -void blk_unplug_work(struct work_struct *work) -{ - struct request_queue *q = - container_of(work, struct request_queue, unplug_work); - - trace_block_unplug_io(q); - q->unplug_fn(q); -} - -void blk_unplug_timeout(unsigned long data) -{ - struct request_queue *q = (struct request_queue *)data; - - trace_block_unplug_timer(q); - kblockd_schedule_work(q, &q->unplug_work); -} - -void blk_unplug(struct request_queue *q) -{ - /* - * devices don't necessarily have an ->unplug_fn defined - */ - if (q->unplug_fn) { - trace_block_unplug_io(q); - q->unplug_fn(q); - } -} -EXPORT_SYMBOL(blk_unplug); - -static void blk_invoke_request_fn(struct request_queue *q) -{ - if (unlikely(blk_queue_stopped(q))) - return; - - /* - * one level of recursion is ok and is much faster than kicking - * the unplug handling - */ - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } -} - -/** - * blk_start_queue - restart a previously stopped queue - * @q: The &struct request_queue in question - * - * Description: - * blk_start_queue() will clear the stop flag on the queue, and call - * the request_fn for the queue if it was in a stopped state when - * entered. Also see blk_stop_queue(). Queue lock must be held. - **/ -void blk_start_queue(struct request_queue *q) -{ - WARN_ON(!irqs_disabled()); - - queue_flag_clear(QUEUE_FLAG_STOPPED, q); - blk_invoke_request_fn(q); -} -EXPORT_SYMBOL(blk_start_queue); - -/** - * blk_stop_queue - stop a queue - * @q: The &struct request_queue in question - * - * Description: - * The Linux block layer assumes that a block driver will consume all - * entries on the request queue when the request_fn strategy is called. - * Often this will not happen, because of hardware limitations (queue - * depth settings). If a device driver gets a 'queue full' response, - * or if it simply chooses not to queue more I/O at one point, it can - * call this function to prevent the request_fn from being called until - * the driver has signalled it's ready to go again. 
This happens by calling - * blk_start_queue() to restart queue operations. Queue lock must be held. - **/ -void blk_stop_queue(struct request_queue *q) -{ - blk_remove_plug(q); - queue_flag_set(QUEUE_FLAG_STOPPED, q); -} -EXPORT_SYMBOL(blk_stop_queue); - -/** - * blk_sync_queue - cancel any pending callbacks on a queue - * @q: the queue - * - * Description: - * The block layer may perform asynchronous callback activity - * on a queue, such as calling the unplug function after a timeout. - * A block device may call blk_sync_queue to ensure that any - * such activity is cancelled, thus allowing it to release resources - * that the callbacks might use. The caller must already have made sure - * that its ->make_request_fn will not re-add plugging prior to calling - * this function. - * - */ -void blk_sync_queue(struct request_queue *q) -{ - del_timer_sync(&q->unplug_timer); - del_timer_sync(&q->timeout); - cancel_work_sync(&q->unplug_work); -} -EXPORT_SYMBOL(blk_sync_queue); - -/** - * __blk_run_queue - run a single device queue - * @q: The queue to run - * - * Description: - * See @blk_run_queue. This variant must be called with the queue lock - * held and interrupts disabled. - * - */ -void __blk_run_queue(struct request_queue *q) -{ - blk_remove_plug(q); - - /* - * Only recurse once to avoid overrunning the stack, let the unplug - * handling reinvoke the handler shortly if we already got there. - */ - if (!elv_queue_empty(q)) - blk_invoke_request_fn(q); -} -EXPORT_SYMBOL(__blk_run_queue); - -/** - * blk_run_queue - run a single device queue - * @q: The queue to run - * - * Description: - * Invoke request handling on this queue, if it has pending work to do. - * May be used to restart queueing when a request has completed. Also - * See @blk_start_queueing. - * - */ -void blk_run_queue(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __blk_run_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_run_queue); - -void blk_put_queue(struct request_queue *q) -{ - kobject_put(&q->kobj); -} - -void blk_cleanup_queue(struct request_queue *q) -{ - /* - * We know we have process context here, so we can be a little - * cautious and ensure that pending block actions on this device - * are done before moving on. Going into this function, we should - * not have processes doing IO to this device. 
- */ - blk_sync_queue(q); - - mutex_lock(&q->sysfs_lock); - queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); - mutex_unlock(&q->sysfs_lock); - - if (q->elevator) - elevator_exit(q->elevator); - - blk_put_queue(q); -} -EXPORT_SYMBOL(blk_cleanup_queue); - -static int blk_init_free_list(struct request_queue *q) -{ - struct request_list *rl = &q->rq; - - rl->count[READ] = rl->count[WRITE] = 0; - rl->starved[READ] = rl->starved[WRITE] = 0; - rl->elvpriv = 0; - init_waitqueue_head(&rl->wait[READ]); - init_waitqueue_head(&rl->wait[WRITE]); - - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, q->node); - - if (!rl->rq_pool) - return -ENOMEM; - - return 0; -} - -struct request_queue *blk_alloc_queue(gfp_t gfp_mask) -{ - return blk_alloc_queue_node(gfp_mask, -1); -} -EXPORT_SYMBOL(blk_alloc_queue); - -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) -{ - struct request_queue *q; - int err; - - q = kmem_cache_alloc_node(blk_requestq_cachep, - gfp_mask | __GFP_ZERO, node_id); - if (!q) - return NULL; - - q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; - q->backing_dev_info.unplug_io_data = q; - err = bdi_init(&q->backing_dev_info); - if (err) { - kmem_cache_free(blk_requestq_cachep, q); - return NULL; - } - - init_timer(&q->unplug_timer); - setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); - INIT_LIST_HEAD(&q->timeout_list); - INIT_WORK(&q->unplug_work, blk_unplug_work); - - kobject_init(&q->kobj, &blk_queue_ktype); - - mutex_init(&q->sysfs_lock); - spin_lock_init(&q->__queue_lock); - - return q; -} -EXPORT_SYMBOL(blk_alloc_queue_node); - -/** - * blk_init_queue - prepare a request queue for use with a block device - * @rfn: The function to be called to process requests that have been - * placed on the queue. - * @lock: Request queue spin lock - * - * Description: - * If a block device wishes to use the standard request handling procedures, - * which sorts requests and coalesces adjacent requests, then it must - * call blk_init_queue(). The function @rfn will be called when there - * are requests on the queue that need to be processed. If the device - * supports plugging, then @rfn may not be called immediately when requests - * are available on the queue, but may be called at some time later instead. - * Plugged queues are generally unplugged when a buffer belonging to one - * of the requests on the queue is needed, or due to memory pressure. - * - * @rfn is not required, or even expected, to remove all requests off the - * queue, but only as many as it can handle at a time. If it does leave - * requests on the queue, it is responsible for arranging that the requests - * get dealt with eventually. - * - * The queue spin lock must be held while manipulating the requests on the - * request queue; this lock will be taken also from interrupt context, so irq - * disabling is needed for it. - * - * Function returns a pointer to the initialized request queue, or %NULL if - * it didn't succeed. - * - * Note: - * blk_init_queue() must be paired with a blk_cleanup_queue() call - * when the block device is deactivated (such as at module unload). 
- **/ - -struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) -{ - return blk_init_queue_node(rfn, lock, -1); -} -EXPORT_SYMBOL(blk_init_queue); - -struct request_queue * -blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) -{ - struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); - - if (!q) - return NULL; - - q->node = node_id; - if (blk_init_free_list(q)) { - kmem_cache_free(blk_requestq_cachep, q); - return NULL; - } - - /* - * if caller didn't supply a lock, they get per-queue locking with - * our embedded lock - */ - if (!lock) - lock = &q->__queue_lock; - - q->request_fn = rfn; - q->prep_rq_fn = NULL; - q->unplug_fn = generic_unplug_device; - q->queue_flags = QUEUE_FLAG_DEFAULT; - q->queue_lock = lock; - - blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); - - blk_queue_make_request(q, __make_request); - blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); - - blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); - blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); - - q->sg_reserved_size = INT_MAX; - - blk_set_cmd_filter_defaults(&q->cmd_filter); - - /* - * all done - */ - if (!elevator_init(q, NULL)) { - blk_queue_congestion_threshold(q); - return q; - } - - blk_put_queue(q); - return NULL; -} -EXPORT_SYMBOL(blk_init_queue_node); - -int blk_get_queue(struct request_queue *q) -{ - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { - kobject_get(&q->kobj); - return 0; - } - - return 1; -} - -static inline void blk_free_request(struct request_queue *q, struct request *rq) -{ - if (rq->cmd_flags & REQ_ELVPRIV) - elv_put_request(q, rq); - mempool_free(rq, q->rq.rq_pool); -} - -static struct request * -blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) -{ - struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); - - if (!rq) - return NULL; - - blk_rq_init(q, rq); - - rq->cmd_flags = rw | REQ_ALLOCED; - - if (priv) { - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); - return NULL; - } - rq->cmd_flags |= REQ_ELVPRIV; - } - - return rq; -} - -/* - * ioc_batching returns true if the ioc is a valid batching request and - * should be given priority access to a request. - */ -static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) -{ - if (!ioc) - return 0; - - /* - * Make sure the process is able to allocate at least 1 request - * even if the batch times out, otherwise we could theoretically - * lose wakeups. - */ - return ioc->nr_batch_requests == q->nr_batching || - (ioc->nr_batch_requests > 0 - && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); -} - -/* - * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This - * will cause the process to be a "batcher" on all queues in the system. This - * is the behaviour we want though - once it gets a wakeup it should be given - * a nice run. - */ -static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) -{ - if (!ioc || ioc_batching(q, ioc)) - return; - - ioc->nr_batch_requests = q->nr_batching; - ioc->last_waited = jiffies; -} - -static void __freed_request(struct request_queue *q, int rw) -{ - struct request_list *rl = &q->rq; - - if (rl->count[rw] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, rw); - - if (rl->count[rw] + 1 <= q->nr_requests) { - if (waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); - - blk_clear_queue_full(q, rw); - } -} - -/* - * A request has just been released. 
Account for it, update the full and - * congestion status, wake up any waiters. Called under q->queue_lock. - */ -static void freed_request(struct request_queue *q, int rw, int priv) -{ - struct request_list *rl = &q->rq; - - rl->count[rw]--; - if (priv) - rl->elvpriv--; - - __freed_request(q, rw); - - if (unlikely(rl->starved[rw ^ 1])) - __freed_request(q, rw ^ 1); -} - -#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) -/* - * Get a free request, queue_lock must be held. - * Returns NULL on failure, with queue_lock held. - * Returns !NULL on success, with queue_lock *not held*. - */ -static struct request *get_request(struct request_queue *q, int rw_flags, - struct bio *bio, gfp_t gfp_mask) -{ - struct request *rq = NULL; - struct request_list *rl = &q->rq; - struct io_context *ioc = NULL; - const int rw = rw_flags & 0x01; - int may_queue, priv; - - may_queue = elv_may_queue(q, rw_flags); - if (may_queue == ELV_MQUEUE_NO) - goto rq_starved; - - if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { - if (rl->count[rw]+1 >= q->nr_requests) { - ioc = current_io_context(GFP_ATOMIC, q->node); - /* - * The queue will fill after this allocation, so set - * it as full, and mark this process as "batching". - * This process will be allowed to complete a batch of - * requests, others will be blocked. - */ - if (!blk_queue_full(q, rw)) { - ioc_set_batching(q, ioc); - blk_set_queue_full(q, rw); - } else { - if (may_queue != ELV_MQUEUE_MUST - && !ioc_batching(q, ioc)) { - /* - * The queue is full and the allocating - * process is not a "batcher", and not - * exempted by the IO scheduler - */ - goto out; - } - } - } - blk_set_queue_congested(q, rw); - } - - /* - * Only allow batching queuers to allocate up to 50% over the defined - * limit of requests, otherwise we could have thousands of requests - * allocated with any setting of ->nr_requests - */ - if (rl->count[rw] >= (3 * q->nr_requests / 2)) - goto out; - - rl->count[rw]++; - rl->starved[rw] = 0; - - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; - - spin_unlock_irq(q->queue_lock); - - rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); - if (unlikely(!rq)) { - /* - * Allocation failed presumably due to memory. Undo anything - * we might have messed up. - * - * Allocating task should really be put onto the front of the - * wait queue, but this is pretty rare. - */ - spin_lock_irq(q->queue_lock); - freed_request(q, rw, priv); - - /* - * in the very unlikely event that allocation failed and no - * requests for this direction was pending, mark us starved - * so that freeing of a request in the other direction will - * notice us. another possible fix would be to split the - * rq mempool into READ and WRITE - */ -rq_starved: - if (unlikely(rl->count[rw] == 0)) - rl->starved[rw] = 1; - - goto out; - } - - /* - * ioc may be NULL here, and ioc_batching will be false. That's - * OK, if the queue is under the request limit then requests need - * not count toward the nr_batch_requests limit. There will always - * be some limit enforced by BLK_BATCH_TIME. - */ - if (ioc_batching(q, ioc)) - ioc->nr_batch_requests--; - - trace_block_getrq(q, bio, rw); -out: - return rq; -} - -/* - * No available requests for this queue, unplug the device and wait for some - * requests to become available. - * - * Called with q->queue_lock held, and returns with it unlocked. 
- */ -static struct request *get_request_wait(struct request_queue *q, int rw_flags, - struct bio *bio) -{ - const int rw = rw_flags & 0x01; - struct request *rq; - - rq = get_request(q, rw_flags, bio, GFP_NOIO); - while (!rq) { - DEFINE_WAIT(wait); - struct io_context *ioc; - struct request_list *rl = &q->rq; - - prepare_to_wait_exclusive(&rl->wait[rw], &wait, - TASK_UNINTERRUPTIBLE); - - trace_block_sleeprq(q, bio, rw); - - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - io_schedule(); - - /* - * After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - ioc = current_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, ioc); - - spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[rw], &wait); - - rq = get_request(q, rw_flags, bio, GFP_NOIO); - }; - - return rq; -} - -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) -{ - struct request *rq; - - BUG_ON(rw != READ && rw != WRITE); - - spin_lock_irq(q->queue_lock); - if (gfp_mask & __GFP_WAIT) { - rq = get_request_wait(q, rw, NULL); - } else { - rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) - spin_unlock_irq(q->queue_lock); - } - /* q->queue_lock is unlocked at this point */ - - return rq; -} -EXPORT_SYMBOL(blk_get_request); - -/** - * blk_start_queueing - initiate dispatch of requests to device - * @q: request queue to kick into gear - * - * This is basically a helper to remove the need to know whether a queue - * is plugged or not if someone just wants to initiate dispatch of requests - * for this queue. Should be used to start queueing on a device outside - * of ->request_fn() context. Also see @blk_run_queue. - * - * The queue lock must be held with interrupts disabled. - */ -void blk_start_queueing(struct request_queue *q) -{ - if (!blk_queue_plugged(q)) { - if (unlikely(blk_queue_stopped(q))) - return; - q->request_fn(q); - } else - __generic_unplug_device(q); -} -EXPORT_SYMBOL(blk_start_queueing); - -/** - * blk_requeue_request - put a request back on queue - * @q: request queue where request should be inserted - * @rq: request to be inserted - * - * Description: - * Drivers often keep queueing requests until the hardware cannot accept - * more, when that condition happens we need to put the request back - * on the queue. Must be called with queue lock held. - */ -void blk_requeue_request(struct request_queue *q, struct request *rq) -{ - blk_delete_timer(rq); - blk_clear_rq_complete(rq); - trace_block_rq_requeue(q, rq); - - if (blk_rq_tagged(rq)) - blk_queue_end_tag(q, rq); - - elv_requeue_request(q, rq); -} -EXPORT_SYMBOL(blk_requeue_request); - -/** - * blk_insert_request - insert a special request into a request queue - * @q: request queue where request should be inserted - * @rq: request to be inserted - * @at_head: insert request at head or tail of queue - * @data: private data - * - * Description: - * Many block devices need to execute commands asynchronously, so they don't - * block the whole kernel from preemption during request execution. This is - * accomplished normally by inserting aritficial requests tagged as - * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them - * be scheduled for actual execution by the request queue. - * - * We have the option of inserting the head or the tail of the queue. - * Typically we use the tail for new ioctls and so forth. 
We use the head - * of the queue for things like a QUEUE_FULL message from a device, or a - * host that is unable to accept a particular command. - */ -void blk_insert_request(struct request_queue *q, struct request *rq, - int at_head, void *data) -{ - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - unsigned long flags; - - /* - * tell I/O scheduler that this isn't a regular read/write (ie it - * must not attempt merges on this) and that it acts as a soft - * barrier - */ - rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_flags |= REQ_SOFTBARRIER; - - rq->special = data; - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * If command is tagged, release the tag - */ - if (blk_rq_tagged(rq)) - blk_queue_end_tag(q, rq); - - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, where, 0); - blk_start_queueing(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_insert_request); - -/* - * add-request adds a request to the linked list. - * queue lock is held and interrupts disabled, as we muck with the - * request queue list. - */ -static inline void add_request(struct request_queue *q, struct request *req) -{ - drive_stat_acct(req, 1); - - /* - * elevator indicated where it wants this request to be - * inserted at elevator_merge time - */ - __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); -} - -static void part_round_stats_single(int cpu, struct hd_struct *part, - unsigned long now) -{ - if (now == part->stamp) - return; - - if (part->in_flight) { - __part_stat_add(cpu, part, time_in_queue, - part->in_flight * (now - part->stamp)); - __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); - } - part->stamp = now; -} - -/** - * part_round_stats() - Round off the performance stats on a struct disk_stats. - * @cpu: cpu number for stats access - * @part: target partition - * - * The average IO queue length and utilisation statistics are maintained - * by observing the current state of the queue length and the amount of - * time it has been in this state for. - * - * Normally, that accounting is done on IO completion, but that can result - * in more than a second's worth of IO being accounted for within any one - * second, leading to >100% utilisation. To deal with that, we call this - * function to do a round-off before returning the results when reading - * /proc/diskstats. This accounts immediately for all queue usage up to - * the current jiffies and restarts the counters again. - */ -void part_round_stats(int cpu, struct hd_struct *part) -{ - unsigned long now = jiffies; - - if (part->partno) - part_round_stats_single(cpu, &part_to_disk(part)->part0, now); - part_round_stats_single(cpu, part, now); -} -EXPORT_SYMBOL_GPL(part_round_stats); - -/* - * queue lock must be held - */ -void __blk_put_request(struct request_queue *q, struct request *req) -{ - if (unlikely(!q)) - return; - if (unlikely(--req->ref_count)) - return; - - elv_completed_request(q, req); - - /* - * Request may not have originated from ll_rw_blk. 
if not, - * it didn't come out of our reserved rq pools - */ - if (req->cmd_flags & REQ_ALLOCED) { - int rw = rq_data_dir(req); - int priv = req->cmd_flags & REQ_ELVPRIV; - - BUG_ON(!list_empty(&req->queuelist)); - BUG_ON(!hlist_unhashed(&req->hash)); - - blk_free_request(q, req); - freed_request(q, rw, priv); - } -} -EXPORT_SYMBOL_GPL(__blk_put_request); - -void blk_put_request(struct request *req) -{ - unsigned long flags; - struct request_queue *q = req->q; - - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_put_request); - -void init_request_from_bio(struct request *req, struct bio *bio) -{ - req->cpu = bio->bi_comp_cpu; - req->cmd_type = REQ_TYPE_FS; - - /* - * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) - */ - if (bio_rw_ahead(bio)) - req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | - REQ_FAILFAST_DRIVER); - if (bio_failfast_dev(bio)) - req->cmd_flags |= REQ_FAILFAST_DEV; - if (bio_failfast_transport(bio)) - req->cmd_flags |= REQ_FAILFAST_TRANSPORT; - if (bio_failfast_driver(bio)) - req->cmd_flags |= REQ_FAILFAST_DRIVER; - - /* - * REQ_BARRIER implies no merging, but lets make it explicit - */ - if (unlikely(bio_discard(bio))) { - req->cmd_flags |= REQ_DISCARD; - if (bio_barrier(bio)) - req->cmd_flags |= REQ_SOFTBARRIER; - req->q->prepare_discard_fn(req->q, req); - } else if (unlikely(bio_barrier(bio))) - req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); - - if (bio_sync(bio)) - req->cmd_flags |= REQ_RW_SYNC; - if (bio_unplug(bio)) - req->cmd_flags |= REQ_UNPLUG; - if (bio_rw_meta(bio)) - req->cmd_flags |= REQ_RW_META; - - req->errors = 0; - req->hard_sector = req->sector = bio->bi_sector; - req->ioprio = bio_prio(bio); - req->start_time = jiffies; - blk_rq_bio_prep(req->q, req, bio); -} - -static int __make_request(struct request_queue *q, struct bio *bio) -{ - struct request *req; - int el_ret, nr_sectors; - const unsigned short prio = bio_prio(bio); - const int sync = bio_sync(bio); - const int unplug = bio_unplug(bio); - int rw_flags; - - nr_sectors = bio_sectors(bio); - - /* - * low level driver can indicate that it wants pages above a - * certain limit bounced to low memory (ie for highmem, or even - * ISA dma in theory) - */ - blk_queue_bounce(q, &bio); - - spin_lock_irq(q->queue_lock); - - if (unlikely(bio_barrier(bio)) || elv_queue_empty(q)) - goto get_rq; - - el_ret = elv_merge(q, &req, bio); - switch (el_ret) { - case ELEVATOR_BACK_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_back_merge_fn(q, req, bio)) - break; - - trace_block_bio_backmerge(q, bio); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; - - case ELEVATOR_FRONT_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_front_merge_fn(q, req, bio)) - break; - - trace_block_bio_frontmerge(q, bio); - - bio->bi_next = req->bio; - req->bio = bio; - - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... 
- */ - req->buffer = bio_data(bio); - req->current_nr_sectors = bio_cur_sectors(bio); - req->hard_cur_sectors = req->current_nr_sectors; - req->sector = req->hard_sector = bio->bi_sector; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; - - /* ELV_NO_MERGE: elevator says don't/can't merge. */ - default: - ; - } - -get_rq: - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the - * rq allocator and io schedulers. - */ - rw_flags = bio_data_dir(bio); - if (sync) - rw_flags |= REQ_RW_SYNC; - - /* - * Grab a free request. This is might sleep but can not fail. - * Returns with the queue unlocked. - */ - req = get_request_wait(q, rw_flags, bio); - - /* - * After dropping the lock and possibly sleeping here, our request - * may now be mergeable after it had proven unmergeable (above). - * We don't worry about that case for efficiency. It won't happen - * often, and the elevators are able to handle it. - */ - init_request_from_bio(req, bio); - - spin_lock_irq(q->queue_lock); - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = blk_cpu_to_group(smp_processor_id()); - if (!blk_queue_nonrot(q) && elv_queue_empty(q)) - blk_plug_device(q); - add_request(q, req); -out: - if (unplug || blk_queue_nonrot(q)) - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - return 0; -} - -/* - * If bio->bi_dev is a partition, remap the location - */ -static inline void blk_partition_remap(struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - - if (bio_sectors(bio) && bdev != bdev->bd_contains) { - struct hd_struct *p = bdev->bd_part; - - bio->bi_sector += p->start_sect; - bio->bi_bdev = bdev->bd_contains; - - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, bio->bi_sector, - bio->bi_sector - p->start_sect); - } -} - -static void handle_bad_sector(struct bio *bio) -{ - char b[BDEVNAME_SIZE]; - - printk(KERN_INFO "attempt to access beyond end of device\n"); - printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", - bdevname(bio->bi_bdev, b), - bio->bi_rw, - (unsigned long long)bio->bi_sector + bio_sectors(bio), - (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); -} - -#ifdef CONFIG_FAIL_MAKE_REQUEST - -static DECLARE_FAULT_ATTR(fail_make_request); - -static int __init setup_fail_make_request(char *str) -{ - return setup_fault_attr(&fail_make_request, str); -} -__setup("fail_make_request=", setup_fail_make_request); - -static int should_fail_request(struct bio *bio) -{ - struct hd_struct *part = bio->bi_bdev->bd_part; - - if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) - return should_fail(&fail_make_request, bio->bi_size); - - return 0; -} - -static int __init fail_make_request_debugfs(void) -{ - return init_fault_attr_dentries(&fail_make_request, - "fail_make_request"); -} - -late_initcall(fail_make_request_debugfs); - -#else /* CONFIG_FAIL_MAKE_REQUEST */ - -static inline int should_fail_request(struct bio *bio) -{ - return 0; -} - -#endif /* CONFIG_FAIL_MAKE_REQUEST */ - -/* - * Check whether this bio extends beyond the end of the device. 
- */ -static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) -{ - sector_t maxsector; - - if (!nr_sectors) - return 0; - - /* Test device or partition size, when known. */ - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; - if (maxsector) { - sector_t sector = bio->bi_sector; - - if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { - /* - * This may well happen - the kernel calls bread() - * without checking the size of the device, e.g., when - * mounting a device. - */ - handle_bad_sector(bio); - return 1; - } - } - - return 0; -} - -/** - * generic_make_request - hand a buffer to its device driver for I/O - * @bio: The bio describing the location in memory and on the device. - * - * generic_make_request() is used to make I/O requests of block - * devices. It is passed a &struct bio, which describes the I/O that needs - * to be done. - * - * generic_make_request() does not return any status. The - * success/failure status of the request, along with notification of - * completion, is delivered asynchronously through the bio->bi_end_io - * function described (one day) else where. - * - * The caller of generic_make_request must make sure that bi_io_vec - * are set to describe the memory buffer, and that bi_dev and bi_sector are - * set to describe the device address, and the - * bi_end_io and optionally bi_private are set to describe how - * completion notification should be signaled. - * - * generic_make_request and the drivers it calls may use bi_next if this - * bio happens to be merged with someone else, and may change bi_dev and - * bi_sector for remaps as it sees fit. So the values of these fields - * should NOT be depended on after the call to generic_make_request. - */ -static inline void __generic_make_request(struct bio *bio) -{ - struct request_queue *q; - sector_t old_sector; - int ret, nr_sectors = bio_sectors(bio); - dev_t old_dev; - int err = -EIO; - - might_sleep(); - - if (bio_check_eod(bio, nr_sectors)) - goto end_io; - - /* - * Resolve the mapping until finished. (drivers are - * still free to implement/resolve their own stacking - * by explicitly returning 0) - * - * NOTE: we don't repeat the blk_size check for each new device. - * Stacking drivers are expected to know what they are doing. - */ - old_sector = -1; - old_dev = 0; - do { - char b[BDEVNAME_SIZE]; - - q = bdev_get_queue(bio->bi_bdev); - if (unlikely(!q)) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%Lu)\n", - bdevname(bio->bi_bdev, b), - (long long) bio->bi_sector); - goto end_io; - } - - if (unlikely(nr_sectors > q->max_hw_sectors)) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - q->max_hw_sectors); - goto end_io; - } - - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - goto end_io; - - if (should_fail_request(bio)) - goto end_io; - - /* - * If this device has partitions, remap block n - * of partition p to block n+start(p) of the disk. 
- */ - blk_partition_remap(bio); - - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) - goto end_io; - - if (old_sector != -1) - trace_block_remap(q, bio, old_dev, bio->bi_sector, - old_sector); - - trace_block_bio_queue(q, bio); - - old_sector = bio->bi_sector; - old_dev = bio->bi_bdev->bd_dev; - - if (bio_check_eod(bio, nr_sectors)) - goto end_io; - - if (bio_discard(bio) && !q->prepare_discard_fn) { - err = -EOPNOTSUPP; - goto end_io; - } - if (bio_barrier(bio) && bio_has_data(bio) && - (q->next_ordered == QUEUE_ORDERED_NONE)) { - err = -EOPNOTSUPP; - goto end_io; - } - - ret = q->make_request_fn(q, bio); - } while (ret); - - return; - -end_io: - bio_endio(bio, err); -} - -/* - * We only want one ->make_request_fn to be active at a time, - * else stack usage with stacked devices could be a problem. - * So use current->bio_{list,tail} to keep a list of requests - * submited by a make_request_fn function. - * current->bio_tail is also used as a flag to say if - * generic_make_request is currently active in this task or not. - * If it is NULL, then no make_request is active. If it is non-NULL, - * then a make_request is active, and new requests should be added - * at the tail - */ -void generic_make_request(struct bio *bio) -{ - if (current->bio_tail) { - /* make_request is active */ - *(current->bio_tail) = bio; - bio->bi_next = NULL; - current->bio_tail = &bio->bi_next; - return; - } - /* following loop may be a bit non-obvious, and so deserves some - * explanation. - * Before entering the loop, bio->bi_next is NULL (as all callers - * ensure that) so we have a list with a single bio. - * We pretend that we have just taken it off a longer list, so - * we assign bio_list to the next (which is NULL) and bio_tail - * to &bio_list, thus initialising the bio_list of new bios to be - * added. __generic_make_request may indeed add some more bios - * through a recursive call to generic_make_request. If it - * did, we find a non-NULL value in bio_list and re-enter the loop - * from the top. In this case we really did just take the bio - * of the top of the list (no pretending) and so fixup bio_list and - * bio_tail or bi_next, and call into __generic_make_request again. - * - * The loop was structured like this to make only one call to - * __generic_make_request (which is important as it is large and - * inlined) and to keep the structure simple. - */ - BUG_ON(bio->bi_next); - do { - current->bio_list = bio->bi_next; - if (bio->bi_next == NULL) - current->bio_tail = ¤t->bio_list; - else - bio->bi_next = NULL; - __generic_make_request(bio); - bio = current->bio_list; - } while (bio); - current->bio_tail = NULL; /* deactivate */ -} -EXPORT_SYMBOL(generic_make_request); - -/** - * submit_bio - submit a bio to the block device layer for I/O - * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) - * @bio: The &struct bio which describes the I/O - * - * submit_bio() is very similar in purpose to generic_make_request(), and - * uses that function to do most of the work. Both are fairly rough - * interfaces; @bio must be presetup and ready for I/O. - * - */ -void submit_bio(int rw, struct bio *bio) -{ - int count = bio_sectors(bio); - - bio->bi_rw |= rw; - - /* - * If it's a regular read/write or a barrier with data attached, - * go through the normal accounting stuff before submission. 
- */ - if (bio_has_data(bio)) { - if (rw & WRITE) { - count_vm_events(PGPGOUT, count); - } else { - task_io_account_read(bio->bi_size); - count_vm_events(PGPGIN, count); - } - - if (unlikely(block_dump)) { - char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", - current->comm, task_pid_nr(current), - (rw & WRITE) ? "WRITE" : "READ", - (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev, b)); - } - } - - generic_make_request(bio); -} -EXPORT_SYMBOL(submit_bio); - -/** - * blk_rq_check_limits - Helper function to check a request for the queue limit - * @q: the queue - * @rq: the request being checked - * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. - * - * This function should also be useful for request stacking drivers - * in some cases below, so export this fuction. - * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests agaist - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. - */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) -{ - if (rq->nr_sectors > q->max_sectors || - rq->data_len > q->max_hw_sectors << 9) { - printk(KERN_ERR "%s: over max size limit.\n", __func__); - return -EIO; - } - - /* - * queue's settings related to segment counting like q->bounce_pfn - * may differ from that of other stacking queues. - * Recalculate it to check the request correctly on this queue's - * limitation. - */ - blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > q->max_phys_segments || - rq->nr_phys_segments > q->max_hw_segments) { - printk(KERN_ERR "%s: over max segments limit.\n", __func__); - return -EIO; - } - - return 0; -} -EXPORT_SYMBOL_GPL(blk_rq_check_limits); - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -int blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - unsigned long flags; - - if (blk_rq_check_limits(q, rq)) - return -EIO; - -#ifdef CONFIG_FAIL_MAKE_REQUEST - if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && - should_fail(&fail_make_request, blk_rq_bytes(rq))) - return -EIO; -#endif - - spin_lock_irqsave(q->queue_lock, flags); - - /* - * Submitting request must be dequeued before calling this function - * because it will be linked to another request_queue - */ - BUG_ON(blk_queued_rq(rq)); - - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); - - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; -} -EXPORT_SYMBOL_GPL(blk_insert_cloned_request); - -/** - * blkdev_dequeue_request - dequeue request and start timeout timer - * @req: request to dequeue - * - * Dequeue @req and start timeout timer on it. This hands off the - * request to the driver. - * - * Block internal functions which don't want to start timer should - * call elv_dequeue_request(). 
- */ -void blkdev_dequeue_request(struct request *req) -{ - elv_dequeue_request(req->q, req); - - /* - * We are now handing the request to the hardware, add the - * timeout handler. - */ - blk_add_timer(req); -} -EXPORT_SYMBOL(blkdev_dequeue_request); - -static void blk_account_io_completion(struct request *req, unsigned int bytes) -{ - struct gendisk *disk = req->rq_disk; - - if (!disk || !blk_do_io_stat(disk->queue)) - return; - - if (blk_fs_request(req)) { - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); - part_stat_add(cpu, part, sectors[rw], bytes >> 9); - part_stat_unlock(); - } -} - -static void blk_account_io_done(struct request *req) -{ - struct gendisk *disk = req->rq_disk; - - if (!disk || !blk_do_io_stat(disk->queue)) - return; - - /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. - */ - if (blk_fs_request(req) && req != &req->q->bar_rq) { - unsigned long duration = jiffies - req->start_time; - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(disk, req->sector); - - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part); - - part_stat_unlock(); - } -} - -/** - * __end_that_request_first - end I/O on a request - * @req: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @req, and sets it up - * for the next range of segments (if any) in the cluster. - * - * Return: - * %0 - we are done with this request, call end_that_request_last() - * %1 - still buffers pending for this request - **/ -static int __end_that_request_first(struct request *req, int error, - int nr_bytes) -{ - int total_bytes, bio_nbytes, next_idx = 0; - struct bio *bio; - - trace_block_rq_complete(req->q, req); - - /* - * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual - * sense key with us all the way through - */ - if (!blk_pc_request(req)) - req->errors = 0; - - if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { - printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", - req->rq_disk ? 
req->rq_disk->disk_name : "?", - (unsigned long long)req->sector); - } - - blk_account_io_completion(req, nr_bytes); - - total_bytes = bio_nbytes = 0; - while ((bio = req->bio) != NULL) { - int nbytes; - - if (nr_bytes >= bio->bi_size) { - req->bio = bio->bi_next; - nbytes = bio->bi_size; - req_bio_endio(req, bio, nbytes, error); - next_idx = 0; - bio_nbytes = 0; - } else { - int idx = bio->bi_idx + next_idx; - - if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { - blk_dump_rq_flags(req, "__end_that"); - printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", - __func__, bio->bi_idx, bio->bi_vcnt); - break; - } - - nbytes = bio_iovec_idx(bio, idx)->bv_len; - BIO_BUG_ON(nbytes > bio->bi_size); - - /* - * not a complete bvec done - */ - if (unlikely(nbytes > nr_bytes)) { - bio_nbytes += nr_bytes; - total_bytes += nr_bytes; - break; - } - - /* - * advance to the next vector - */ - next_idx++; - bio_nbytes += nbytes; - } - - total_bytes += nbytes; - nr_bytes -= nbytes; - - bio = req->bio; - if (bio) { - /* - * end more in this run, or just return 'not-done' - */ - if (unlikely(nr_bytes <= 0)) - break; - } - } - - /* - * completely done - */ - if (!req->bio) - return 0; - - /* - * if the request wasn't completed, update state - */ - if (bio_nbytes) { - req_bio_endio(req, bio, bio_nbytes, error); - bio->bi_idx += next_idx; - bio_iovec(bio)->bv_offset += nr_bytes; - bio_iovec(bio)->bv_len -= nr_bytes; - } - - blk_recalc_rq_sectors(req, total_bytes >> 9); - blk_recalc_rq_segments(req); - return 1; -} - -/* - * queue lock must be held - */ -static void end_that_request_last(struct request *req, int error) -{ - if (blk_rq_tagged(req)) - blk_queue_end_tag(req->q, req); - - if (blk_queued_rq(req)) - elv_dequeue_request(req->q, req); - -#ifndef DDE_LINUX - if (unlikely(laptop_mode) && blk_fs_request(req)) - laptop_io_completion(); -#endif - - blk_delete_timer(req); - - blk_account_io_done(req); - - if (req->end_io) - req->end_io(req, error); - else { - if (blk_bidi_rq(req)) - __blk_put_request(req->next_rq->q, req->next_rq); - - __blk_put_request(req->q, req); - } -} - -/** - * blk_rq_bytes - Returns bytes left to complete in the entire request - * @rq: the request being processed - **/ -unsigned int blk_rq_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->hard_nr_sectors << 9; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_bytes); - -/** - * blk_rq_cur_bytes - Returns bytes left to complete in the current segment - * @rq: the request being processed - **/ -unsigned int blk_rq_cur_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->current_nr_sectors << 9; - - if (rq->bio) - return rq->bio->bi_size; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); - -/** - * end_request - end I/O on the current segment of the request - * @req: the request being processed - * @uptodate: error value or %0/%1 uptodate flag - * - * Description: - * Ends I/O on the current segment of a request. If that is the only - * remaining segment, the request is also completed and freed. - * - * This is a remnant of how older block drivers handled I/O completions. - * Modern drivers typically end I/O on the full request in one go, unless - * they have a residual value to account for. For that case this function - * isn't really useful, unless the residual just happens to be the - * full current segment. In other words, don't use this function in new - * code. Use blk_end_request() or __blk_end_request() to end a request. 
- **/ -void end_request(struct request *req, int uptodate) -{ - int error = 0; - - if (uptodate <= 0) - error = uptodate ? uptodate : -EIO; - - __blk_end_request(req, error, req->hard_cur_sectors << 9); -} -EXPORT_SYMBOL(end_request); - -static int end_that_request_data(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) -{ - if (rq->bio) { - if (__end_that_request_first(rq, error, nr_bytes)) - return 1; - - /* Bidi request must be completed as a whole */ - if (blk_bidi_rq(rq) && - __end_that_request_first(rq->next_rq, error, bidi_bytes)) - return 1; - } - - return 0; -} - -/** - * blk_end_io - Generic end_io function to complete a request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq - * @drv_callback: function called between completion of bios in the request - * and completion of the request. - * If the callback returns non %0, this helper returns without - * completion of the request. - * - * Description: - * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %0 - we are done with this request - * %1 - this request is not freed yet, it still has pending buffers. - **/ -static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes, - int (drv_callback)(struct request *)) -{ - struct request_queue *q = rq->q; - unsigned long flags = 0UL; - - if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) - return 1; - - /* Special feature for tricky drivers */ - if (drv_callback && drv_callback(rq)) - return 1; - -#ifndef DDE_LINUX - add_disk_randomness(rq->rq_disk); -#endif - - spin_lock_irqsave(q->queue_lock, flags); - end_that_request_last(rq, error); - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; -} - -/** - * blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) -{ - return blk_end_io(rq, error, nr_bytes, 0, NULL); -} -EXPORT_SYMBOL_GPL(blk_end_request); - -/** - * __blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Must be called with queue lock held unlike blk_end_request(). - * - * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) -{ - if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) - return 1; - -#ifndef DDE_LINUX - add_disk_randomness(rq->rq_disk); -#endif - - end_that_request_last(rq, error); - - return 0; -} -EXPORT_SYMBOL_GPL(__blk_end_request); - -/** - * blk_end_bidi_request - Helper function for drivers to complete bidi request. 
- * @rq: the bidi request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq - * - * Description: - * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. - * - * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) -{ - return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); -} -EXPORT_SYMBOL_GPL(blk_end_bidi_request); - -/** - * blk_update_request - Special helper function for request stacking drivers - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @rq - * - * Description: - * Ends I/O on a number of bytes attached to @rq, but doesn't complete - * the request structure even if @rq doesn't have leftover. - * If @rq has leftover, sets it up for the next range of segments. - * - * This special helper function is only for request stacking drivers - * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_end_request instead. - */ -void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) -{ - if (!end_that_request_data(rq, error, nr_bytes, 0)) { - /* - * These members are not updated in end_that_request_data() - * when all bios are completed. - * Update them so that the request stacking driver can find - * how many bytes remain in the request later. - */ - rq->nr_sectors = rq->hard_nr_sectors = 0; - rq->current_nr_sectors = rq->hard_cur_sectors = 0; - } -} -EXPORT_SYMBOL_GPL(blk_update_request); - -/** - * blk_end_request_callback - Special helper function for tricky drivers - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * @drv_callback: function called between completion of bios in the request - * and completion of the request. - * If the callback returns non %0, this helper returns without - * completion of the request. - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * This special helper function is used only for existing tricky drivers. - * (e.g. cdrom_newpc_intr() of ide-cd) - * This interface will be removed when such drivers are rewritten. - * Don't use this interface in other places anymore. - * - * Return: - * %0 - we are done with this request - * %1 - this request is not freed yet. - * this request still has pending buffers or - * the driver doesn't want to finish this request yet. - **/ -int blk_end_request_callback(struct request *rq, int error, - unsigned int nr_bytes, - int (drv_callback)(struct request *)) -{ - return blk_end_io(rq, error, nr_bytes, 0, drv_callback); -} -EXPORT_SYMBOL_GPL(blk_end_request_callback); - -void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and - we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). 
*/ - rq->cmd_flags |= (bio->bi_rw & 3); - - if (bio_has_data(bio)) { - rq->nr_phys_segments = bio_phys_segments(q, bio); - rq->buffer = bio_data(bio); - } - rq->current_nr_sectors = bio_cur_sectors(bio); - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); - rq->data_len = bio->bi_size; - - rq->bio = rq->biotail = bio; - - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; -} - -/** - * blk_lld_busy - Check if underlying low-level drivers of a device are busy - * @q : the queue of the device being checked - * - * Description: - * Check if underlying low-level drivers of a device are busy. - * If the drivers want to export their busy state, they must set own - * exporting function using blk_queue_lld_busy() first. - * - * Basically, this function is used only by request stacking drivers - * to stop dispatching requests to underlying devices when underlying - * devices are busy. This behavior helps more I/O merging on the queue - * of the request stacking driver and prevents I/O throughput regression - * on burst I/O load. - * - * Return: - * 0 - Not busy (The request stacking driver should dispatch request) - * 1 - Busy (The request stacking driver should stop dispatching request) - */ -int blk_lld_busy(struct request_queue *q) -{ - if (q->lld_busy_fn) - return q->lld_busy_fn(q); - - return 0; -} -EXPORT_SYMBOL_GPL(blk_lld_busy); - -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) -{ - return queue_work(kblockd_workqueue, work); -} -EXPORT_SYMBOL(kblockd_schedule_work); - -int __init blk_dev_init(void) -{ - kblockd_workqueue = create_workqueue("kblockd"); - if (!kblockd_workqueue) - panic("Failed to create kblockd\n"); - - request_cachep = kmem_cache_create("blkdev_requests", - sizeof(struct request), 0, SLAB_PANIC, NULL); - - blk_requestq_cachep = kmem_cache_create("blkdev_queue", - sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - - return 0; -} - diff --git a/libdde_linux26/lib/src/block/.svn/text-base/blk.h.svn-base b/libdde_linux26/lib/src/block/.svn/text-base/blk.h.svn-base deleted file mode 100644 index 0dce92c3..00000000 --- a/libdde_linux26/lib/src/block/.svn/text-base/blk.h.svn-base +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef BLK_INTERNAL_H -#define BLK_INTERNAL_H - -/* Amount of time in which a process may batch requests */ -#define BLK_BATCH_TIME (HZ/50UL) - -/* Number of requests a "batching" process may submit */ -#define BLK_BATCH_REQ 32 - -extern struct kmem_cache *blk_requestq_cachep; -extern struct kobj_type blk_queue_ktype; - -void init_request_from_bio(struct request *req, struct bio *bio); -void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio); -void __blk_queue_free_tags(struct request_queue *q); - -void blk_unplug_work(struct work_struct *work); -void blk_unplug_timeout(unsigned long data); -void blk_rq_timed_out_timer(unsigned long data); -void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); -void __generic_unplug_device(struct request_queue *); - -/* - * Internal atomic flags for request handling - */ -enum rq_atomic_flags { - REQ_ATOM_COMPLETE = 0, -}; - -/* - * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them suceeds - */ -static inline int blk_mark_rq_complete(struct request *rq) -{ - return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); -} - -static inline void blk_clear_rq_complete(struct request *rq) -{ - clear_bit(REQ_ATOM_COMPLETE, 
&rq->atomic_flags); -} - -#ifdef CONFIG_FAIL_IO_TIMEOUT -int blk_should_fake_timeout(struct request_queue *); -ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); -ssize_t part_timeout_store(struct device *, struct device_attribute *, - const char *, size_t); -#else -static inline int blk_should_fake_timeout(struct request_queue *q) -{ - return 0; -} -#endif - -struct io_context *current_io_context(gfp_t gfp_flags, int node); - -int ll_back_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio); -int ll_front_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio); -int attempt_back_merge(struct request_queue *q, struct request *rq); -int attempt_front_merge(struct request_queue *q, struct request *rq); -void blk_recalc_rq_segments(struct request *rq); -void blk_recalc_rq_sectors(struct request *rq, int nsect); - -void blk_queue_congestion_threshold(struct request_queue *q); - -int blk_dev_init(void); - -/* - * Return the threshold (number of used requests) at which the queue is - * considered to be congested. It include a little hysteresis to keep the - * context switch rate down. - */ -static inline int queue_congestion_on_threshold(struct request_queue *q) -{ - return q->nr_congestion_on; -} - -/* - * The threshold at which a queue is considered to be uncongested - */ -static inline int queue_congestion_off_threshold(struct request_queue *q) -{ - return q->nr_congestion_off; -} - -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) - -#endif /* BLK_DEV_INTEGRITY */ - -static inline int blk_cpu_to_group(int cpu) -{ -#ifdef CONFIG_SCHED_MC - const struct cpumask *mask = cpu_coregroup_mask(cpu); - return cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - return first_cpu(per_cpu(cpu_sibling_map, cpu)); -#else - return cpu; -#endif -} - -static inline int blk_do_io_stat(struct request_queue *q) -{ - if (q) - return blk_queue_io_stat(q); - - return 0; -} - -#endif diff --git a/libdde_linux26/lib/src/block/.svn/text-base/genhd.c.svn-base b/libdde_linux26/lib/src/block/.svn/text-base/genhd.c.svn-base deleted file mode 100644 index 921cebff..00000000 --- a/libdde_linux26/lib/src/block/.svn/text-base/genhd.c.svn-base +++ /dev/null @@ -1,1248 +0,0 @@ -/* - * gendisk handling - */ - -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/kdev_t.h> -#include <linux/kernel.h> -#include <linux/blkdev.h> -#include <linux/init.h> -#include <linux/spinlock.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/kmod.h> -#include <linux/kobj_map.h> -#include <linux/buffer_head.h> -#include <linux/mutex.h> -#include <linux/idr.h> - -#include "blk.h" -#ifdef DDE_LINUX -#include "local.h" -#endif - -static DEFINE_MUTEX(block_class_lock); -#ifndef CONFIG_SYSFS_DEPRECATED -struct kobject *block_depr; -#endif - -/* for extended dynamic devt allocation, currently only one major is used */ -#define MAX_EXT_DEVT (1 << MINORBITS) - -/* For extended devt allocation. ext_devt_mutex prevents look up - * results from going away underneath its user. - */ -static DEFINE_MUTEX(ext_devt_mutex); -static DEFINE_IDR(ext_devt_idr); - -static struct device_type disk_type; - -/** - * disk_get_part - get partition - * @disk: disk to look partition from - * @partno: partition number - * - * Look for partition @partno from @disk. 
If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. - */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct hd_struct *part = NULL; - struct disk_part_tbl *ptbl; - - if (unlikely(partno < 0)) - return NULL; - - rcu_read_lock(); - - ptbl = rcu_dereference(disk->part_tbl); - if (likely(partno < ptbl->len)) { - part = rcu_dereference(ptbl->part[partno]); - if (part) - get_device(part_to_dev(part)); - } - - rcu_read_unlock(); - - return part; -} -EXPORT_SYMBOL_GPL(disk_get_part); - -/** - * disk_part_iter_init - initialize partition iterator - * @piter: iterator to initialize - * @disk: disk to iterate over - * @flags: DISK_PITER_* flags - * - * Initialize @piter so that it iterates over partitions of @disk. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, - unsigned int flags) -{ - struct disk_part_tbl *ptbl; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - piter->disk = disk; - piter->part = NULL; - - if (flags & DISK_PITER_REVERSE) - piter->idx = ptbl->len - 1; - else if (flags & DISK_PITER_INCL_PART0) - piter->idx = 0; - else - piter->idx = 1; - - piter->flags = flags; - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(disk_part_iter_init); - -/** - * disk_part_iter_next - proceed iterator to the next partition and return it - * @piter: iterator of interest - * - * Proceed @piter to the next partition and return it. - * - * CONTEXT: - * Don't care. - */ -struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) -{ - struct disk_part_tbl *ptbl; - int inc, end; - - /* put the last partition */ - disk_put_part(piter->part); - piter->part = NULL; - - /* get part_tbl */ - rcu_read_lock(); - ptbl = rcu_dereference(piter->disk->part_tbl); - - /* determine iteration parameters */ - if (piter->flags & DISK_PITER_REVERSE) { - inc = -1; - if (piter->flags & DISK_PITER_INCL_PART0) - end = -1; - else - end = 0; - } else { - inc = 1; - end = ptbl->len; - } - - /* iterate to the next partition */ - for (; piter->idx != end; piter->idx += inc) { - struct hd_struct *part; - - part = rcu_dereference(ptbl->part[piter->idx]); - if (!part) - continue; - if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects) - continue; - - get_device(part_to_dev(part)); - piter->part = part; - piter->idx += inc; - break; - } - - rcu_read_unlock(); - - return piter->part; -} -EXPORT_SYMBOL_GPL(disk_part_iter_next); - -/** - * disk_part_iter_exit - finish up partition iteration - * @piter: iter of interest - * - * Called when iteration is over. Cleans up @piter. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_exit(struct disk_part_iter *piter) -{ - disk_put_part(piter->part); - piter->part = NULL; -} -EXPORT_SYMBOL_GPL(disk_part_iter_exit); - -static inline int sector_in_part(struct hd_struct *part, sector_t sector) -{ - return part->start_sect <= sector && - sector < part->start_sect + part->nr_sects; -} - -/** - * disk_map_sector_rcu - map sector to partition - * @disk: gendisk of interest - * @sector: sector to map - * - * Find out which partition @sector maps to on @disk. This is - * primarily used for stats accounting. - * - * CONTEXT: - * RCU read locked. The returned partition pointer is valid only - * while preemption is disabled. 
- * - * RETURNS: - * Found partition on success, part0 is returned if no partition matches - */ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) -{ - struct disk_part_tbl *ptbl; - struct hd_struct *part; - int i; - - ptbl = rcu_dereference(disk->part_tbl); - - part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector)) - return part; - - for (i = 1; i < ptbl->len; i++) { - part = rcu_dereference(ptbl->part[i]); - - if (part && sector_in_part(part, sector)) { - rcu_assign_pointer(ptbl->last_lookup, part); - return part; - } - } - return &disk->part0; -} -EXPORT_SYMBOL_GPL(disk_map_sector_rcu); - -/* - * Can be deleted altogether. Later. - * - */ -static struct blk_major_name { - struct blk_major_name *next; - int major; - char name[16]; -} *major_names[BLKDEV_MAJOR_HASH_SIZE]; - -/* index in the above - for now: assume no multimajor ranges */ -static inline int major_to_index(int major) -{ - return major % BLKDEV_MAJOR_HASH_SIZE; -} - -#ifdef CONFIG_PROC_FS -void blkdev_show(struct seq_file *seqf, off_t offset) -{ - struct blk_major_name *dp; - - if (offset < BLKDEV_MAJOR_HASH_SIZE) { - mutex_lock(&block_class_lock); - for (dp = major_names[offset]; dp; dp = dp->next) - seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_class_lock); - } -} -#endif /* CONFIG_PROC_FS */ - -/** - * register_blkdev - register a new block device - * - * @major: the requested major device number [1..255]. If @major=0, try to - * allocate any unused major number. - * @name: the name of the new block device as a zero terminated string - * - * The @name must be unique within the system. - * - * The return value depends on the @major input parameter. - * - if a major device number was requested in range [1..255] then the - * function returns zero on success, or a negative error code - * - if any unused major number was requested with @major=0 parameter - * then the return value is the allocated major number in range - * [1..255] or a negative error code otherwise - */ -int register_blkdev(unsigned int major, const char *name) -{ - struct blk_major_name **n, *p; - int index, ret = 0; - - mutex_lock(&block_class_lock); - - /* temporary */ - if (major == 0) { - for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { - if (major_names[index] == NULL) - break; - } - - if (index == 0) { - printk("register_blkdev: failed to get major for %s\n", - name); - ret = -EBUSY; - goto out; - } - major = index; - ret = major; - } - - p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); - if (p == NULL) { - ret = -ENOMEM; - goto out; - } - - p->major = major; - strlcpy(p->name, name, sizeof(p->name)); - p->next = NULL; - index = major_to_index(major); - - for (n = &major_names[index]; *n; n = &(*n)->next) { - if ((*n)->major == major) - break; - } - if (!*n) - *n = p; - else - ret = -EBUSY; - - if (ret < 0) { - printk("register_blkdev: cannot get major %d for %s\n", - major, name); - kfree(p); - } -out: - mutex_unlock(&block_class_lock); - return ret; -} - -EXPORT_SYMBOL(register_blkdev); - -void unregister_blkdev(unsigned int major, const char *name) -{ - struct blk_major_name **n; - struct blk_major_name *p = NULL; - int index = major_to_index(major); - - mutex_lock(&block_class_lock); - for (n = &major_names[index]; *n; n = &(*n)->next) - if ((*n)->major == major) - break; - if (!*n || strcmp((*n)->name, name)) { - WARN_ON(1); - } else { - p = *n; - *n = p->next; - } - mutex_unlock(&block_class_lock); - kfree(p); -} - 
-EXPORT_SYMBOL(unregister_blkdev); - -static struct kobj_map *bdev_map; - -/** - * blk_mangle_minor - scatter minor numbers apart - * @minor: minor number to mangle - * - * Scatter consecutively allocated @minor number apart if MANGLE_DEVT - * is enabled. Mangling twice gives the original value. - * - * RETURNS: - * Mangled value. - * - * CONTEXT: - * Don't care. - */ -static int blk_mangle_minor(int minor) -{ -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - int i; - - for (i = 0; i < MINORBITS / 2; i++) { - int low = minor & (1 << i); - int high = minor & (1 << (MINORBITS - 1 - i)); - int distance = MINORBITS - 1 - 2 * i; - - minor ^= low | high; /* clear both bits */ - low <<= distance; /* swap the positions */ - high >>= distance; - minor |= low | high; /* and set */ - } -#endif - return minor; -} - -/** - * blk_alloc_devt - allocate a dev_t for a partition - * @part: partition to allocate dev_t for - * @devt: out parameter for resulting dev_t - * - * Allocate a dev_t for block device. - * - * RETURNS: - * 0 on success, allocated dev_t is returned in *@devt. -errno on - * failure. - * - * CONTEXT: - * Might sleep. - */ -int blk_alloc_devt(struct hd_struct *part, dev_t *devt) -{ - struct gendisk *disk = part_to_disk(part); - int idx, rc; - - /* in consecutive minor range? */ - if (part->partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->partno); - return 0; - } - - /* allocate ext devt */ - do { - if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) - return -ENOMEM; - rc = idr_get_new(&ext_devt_idr, part, &idx); - } while (rc == -EAGAIN); - - if (rc) - return rc; - - if (idx > MAX_EXT_DEVT) { - idr_remove(&ext_devt_idr, idx); - return -EBUSY; - } - - *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); - return 0; -} - -/** - * blk_free_devt - free a dev_t - * @devt: dev_t to free - * - * Free @devt which was allocated using blk_alloc_devt(). - * - * CONTEXT: - * Might sleep. - */ -void blk_free_devt(dev_t devt) -{ - might_sleep(); - - if (devt == MKDEV(0, 0)) - return; - - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - mutex_lock(&ext_devt_mutex); - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - mutex_unlock(&ext_devt_mutex); - } -} - -static char *bdevt_str(dev_t devt, char *buf) -{ - if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { - char tbuf[BDEVT_SIZE]; - snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); - snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); - } else - snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); - - return buf; -} - -/* - * Register device numbers dev..(dev+range-1) - * range must be nonzero - * The hash chain is sorted on range, so that subranges can override. 
- */ -void blk_register_region(dev_t devt, unsigned long range, struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), void *data) -{ - kobj_map(bdev_map, devt, range, module, probe, lock, data); -} - -EXPORT_SYMBOL(blk_register_region); - -void blk_unregister_region(dev_t devt, unsigned long range) -{ - kobj_unmap(bdev_map, devt, range); -} - -EXPORT_SYMBOL(blk_unregister_region); - -static struct kobject *exact_match(dev_t devt, int *partno, void *data) -{ - struct gendisk *p = data; - - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t devt, void *data) -{ - struct gendisk *p = data; - - if (!get_disk(p)) - return -1; - return 0; -} - -#ifndef DDE_LINUX -/** - * add_disk - add partitioning information to kernel list - * @disk: per-device partitioning information - * - * This function registers the partitioning information in @disk - * with the kernel. - * - * FIXME: error handling - */ -void add_disk(struct gendisk *disk) -{ - struct backing_dev_info *bdi; - dev_t devt; - int retval; - - /* minors == 0 indicates to use ext devt from part0 and should - * be accompanied with EXT_DEVT flag. Make sure all - * parameters make sense. - */ - WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); - - disk->flags |= GENHD_FL_UP; - - retval = blk_alloc_devt(&disk->part0, &devt); - if (retval) { - WARN_ON(1); - return; - } - disk_to_dev(disk)->devt = devt; - - /* ->major and ->first_minor aren't supposed to be - * dereferenced from here on, but set them just in case. - */ - disk->major = MAJOR(devt); - disk->first_minor = MINOR(devt); - - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); - register_disk(disk); - blk_register_queue(disk); - - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); - retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, - "bdi"); - WARN_ON(retval); -} - -EXPORT_SYMBOL(add_disk); -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ -#endif - -void unlink_gendisk(struct gendisk *disk) -{ - sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - bdi_unregister(&disk->queue->backing_dev_info); - blk_unregister_queue(disk); - blk_unregister_region(disk_devt(disk), disk->minors); -} - -#ifndef DDE_LINUX -/** - * get_gendisk - get partitioning information for a given device - * @devt: device to get partitioning information for - * @partno: returned partition index - * - * This function gets the structure containing partitioning - * information for the given device @devt. - */ -struct gendisk *get_gendisk(dev_t devt, int *partno) -{ - struct gendisk *disk = NULL; - - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - struct kobject *kobj; - - kobj = kobj_lookup(bdev_map, devt, partno); - if (kobj) - disk = dev_to_disk(kobj_to_dev(kobj)); - } else { - struct hd_struct *part; - - mutex_lock(&ext_devt_mutex); - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk(part_to_disk(part))) { - *partno = part->partno; - disk = part_to_disk(part); - } - mutex_unlock(&ext_devt_mutex); - } - - return disk; -} -#endif - -/** - * bdget_disk - do bdget() by gendisk and partition number - * @disk: gendisk of interest - * @partno: partition number - * - * Find partition @partno from @disk, do bdget() on it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Resulting block_device on success, NULL on failure. 
- */ -struct block_device *bdget_disk(struct gendisk *disk, int partno) -{ - struct hd_struct *part; - struct block_device *bdev = NULL; - - part = disk_get_part(disk, partno); - if (part) - bdev = bdget(part_devt(part)); - disk_put_part(part); - - return bdev; -} -EXPORT_SYMBOL(bdget_disk); - -/* - * print a full list of all partitions - intended for places where the root - * filesystem can't be mounted and thus to give the victim some idea of what - * went wrong - */ -void __init printk_all_partitions(void) -{ - struct class_dev_iter iter; - struct device *dev; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - char name_buf[BDEVNAME_SIZE]; - char devt_buf[BDEVT_SIZE]; - - /* - * Don't show empty devices or things that have been - * surpressed - */ - if (get_capacity(disk) == 0 || - (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) - continue; - - /* - * Note, unlike /proc/partitions, I am showing the - * numbers in hex - the same format as the root= - * option takes. - */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == &disk->part0; - - printk("%s%s %10llu %s", is_part0 ? "" : " ", - bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf)); - if (is_part0) { - if (disk->driverfs_dev != NULL && - disk->driverfs_dev->driver != NULL) - printk(" driver: %s\n", - disk->driverfs_dev->driver->name); - else - printk(" (driver?)\n"); - } else - printk("\n"); - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); -} - -#ifdef CONFIG_PROC_FS -/* iterator */ -static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) -{ - loff_t skip = *pos; - struct class_dev_iter *iter; - struct device *dev; - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return ERR_PTR(-ENOMEM); - - seqf->private = iter; - class_dev_iter_init(iter, &block_class, NULL, &disk_type); - do { - dev = class_dev_iter_next(iter); - if (!dev) - return NULL; - } while (skip--); - - return dev_to_disk(dev); -} - -static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) -{ - struct device *dev; - - (*pos)++; - dev = class_dev_iter_next(seqf->private); - if (dev) - return dev_to_disk(dev); - - return NULL; -} - -static void disk_seqf_stop(struct seq_file *seqf, void *v) -{ - struct class_dev_iter *iter = seqf->private; - - /* stop is called even after start failed :-( */ - if (iter) { - class_dev_iter_exit(iter); - kfree(iter); - } -} - -static void *show_partition_start(struct seq_file *seqf, loff_t *pos) -{ - static void *p; - - p = disk_seqf_start(seqf, pos); - if (!IS_ERR(p) && p && !*pos) - seq_puts(seqf, "major minor #blocks name\n\n"); - return p; -} - -static int show_partition(struct seq_file *seqf, void *v) -{ - struct gendisk *sgp = v; - struct disk_part_iter piter; - struct hd_struct *part; - char buf[BDEVNAME_SIZE]; - - /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_partitionable(sgp) && - (sgp->flags & GENHD_FL_REMOVABLE))) - return 0; - if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) - return 0; - - /* show the full disk and all non-0 size partitions of it */ - disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - seq_printf(seqf, "%4d %7d %10llu %s\n", - 
MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part->nr_sects >> 1, - disk_name(sgp, part->partno, buf)); - disk_part_iter_exit(&piter); - - return 0; -} - -static const struct seq_operations partitions_op = { - .start = show_partition_start, - .next = disk_seqf_next, - .stop = disk_seqf_stop, - .show = show_partition -}; - -static int partitions_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &partitions_op); -} - -static const struct file_operations proc_partitions_operations = { - .open = partitions_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - - -static struct kobject *base_probe(dev_t devt, int *partno, void *data) -{ - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); - return NULL; -} - -static int __init genhd_device_init(void) -{ - int error; - - block_class.dev_kobj = sysfs_dev_block_kobj; - error = class_register(&block_class); - if (unlikely(error)) - return error; - bdev_map = kobj_map_init(base_probe, &block_class_lock); - blk_dev_init(); - - register_blkdev(BLOCK_EXT_MAJOR, "blkext"); - -#ifndef CONFIG_SYSFS_DEPRECATED - /* create top-level block dir */ - block_depr = kobject_create_and_add("block", NULL); -#endif - return 0; -} - -subsys_initcall(genhd_device_init); - -static ssize_t disk_range_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%d\n", disk->minors); -} - -static ssize_t disk_ext_range_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%d\n", disk_max_parts(disk)); -} - -static ssize_t disk_removable_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%d\n", - (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); -} - -static ssize_t disk_ro_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%d\n", get_disk_ro(disk) ? 
1 : 0); -} - -static ssize_t disk_capability_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%x\n", disk->flags); -} - -static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); -static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); -static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); -static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); -static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); -static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); -static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -#ifdef CONFIG_FAIL_MAKE_REQUEST -static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); -#endif -#ifdef CONFIG_FAIL_IO_TIMEOUT -static struct device_attribute dev_attr_fail_timeout = - __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show, - part_timeout_store); -#endif - -static struct attribute *disk_attrs[] = { - &dev_attr_range.attr, - &dev_attr_ext_range.attr, - &dev_attr_removable.attr, - &dev_attr_ro.attr, - &dev_attr_size.attr, - &dev_attr_capability.attr, - &dev_attr_stat.attr, -#ifdef CONFIG_FAIL_MAKE_REQUEST - &dev_attr_fail.attr, -#endif -#ifdef CONFIG_FAIL_IO_TIMEOUT - &dev_attr_fail_timeout.attr, -#endif - NULL -}; - -static struct attribute_group disk_attr_group = { - .attrs = disk_attrs, -}; - -static struct attribute_group *disk_attr_groups[] = { - &disk_attr_group, - NULL -}; - -static void disk_free_ptbl_rcu_cb(struct rcu_head *head) -{ - struct disk_part_tbl *ptbl = - container_of(head, struct disk_part_tbl, rcu_head); - - kfree(ptbl); -} - -/** - * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way - * @disk: disk to replace part_tbl for - * @new_ptbl: new part_tbl to install - * - * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The - * original ptbl is freed using RCU callback. - * - * LOCKING: - * Matching bd_mutx locked. - */ -static void disk_replace_part_tbl(struct gendisk *disk, - struct disk_part_tbl *new_ptbl) -{ - struct disk_part_tbl *old_ptbl = disk->part_tbl; - - rcu_assign_pointer(disk->part_tbl, new_ptbl); - - if (old_ptbl) { - rcu_assign_pointer(old_ptbl->last_lookup, NULL); -#ifndef DDE_LINUX - call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); -#else - disk_free_ptbl_rcu_cb(&old_ptbl->rcu_head); -#endif - } -} - -/** - * disk_expand_part_tbl - expand disk->part_tbl - * @disk: disk to expand part_tbl for - * @partno: expand such that this partno can fit in - * - * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl - * uses RCU to allow unlocked dereferencing for stats and other stuff. - * - * LOCKING: - * Matching bd_mutex locked, might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int disk_expand_part_tbl(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *old_ptbl = disk->part_tbl; - struct disk_part_tbl *new_ptbl; - int len = old_ptbl ? 
old_ptbl->len : 0; - int target = partno + 1; - size_t size; - int i; - - /* disk_max_parts() is zero during initialization, ignore if so */ - if (disk_max_parts(disk) && target > disk_max_parts(disk)) - return -EINVAL; - - if (target <= len) - return 0; - - size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); - new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); - if (!new_ptbl) - return -ENOMEM; - - INIT_RCU_HEAD(&new_ptbl->rcu_head); - new_ptbl->len = target; - - for (i = 0; i < len; i++) - rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); - - disk_replace_part_tbl(disk, new_ptbl); - return 0; -} - -static void disk_release(struct device *dev) -{ - struct gendisk *disk = dev_to_disk(dev); - - kfree(disk->random); - disk_replace_part_tbl(disk, NULL); - free_part_stats(&disk->part0); - kfree(disk); -} -struct class block_class = { - .name = "block", -}; - -static struct device_type disk_type = { - .name = "disk", - .groups = disk_attr_groups, - .release = disk_release, -}; - -#ifdef CONFIG_PROC_FS -/* - * aggregate disk stat collector. Uses the same stats that the sysfs - * entries do, above, but makes them available through one seq_file. - * - * The output looks suspiciously like /proc/partitions with a bunch of - * extra fields. - */ -static int diskstats_show(struct seq_file *seqf, void *v) -{ - struct gendisk *gp = v; - struct disk_part_iter piter; - struct hd_struct *hd; - char buf[BDEVNAME_SIZE]; - int cpu; - - /* - if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) - seq_puts(seqf, "major minor name" - " rio rmerge rsect ruse wio wmerge " - "wsect wuse running use aveq" - "\n\n"); - */ - - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0); - while ((hd = disk_part_iter_next(&piter))) { - cpu = part_stat_lock(); - part_round_stats(cpu, hd); - part_stat_unlock(); - seq_printf(seqf, "%4d %7d %s %lu %lu %llu " - "%u %lu %lu %llu %u %u %u %u\n", - MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[0]), - part_stat_read(hd, merges[0]), - (unsigned long long)part_stat_read(hd, sectors[0]), - jiffies_to_msecs(part_stat_read(hd, ticks[0])), - part_stat_read(hd, ios[1]), - part_stat_read(hd, merges[1]), - (unsigned long long)part_stat_read(hd, sectors[1]), - jiffies_to_msecs(part_stat_read(hd, ticks[1])), - hd->in_flight, - jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)) - ); - } - disk_part_iter_exit(&piter); - - return 0; -} - -static const struct seq_operations diskstats_op = { - .start = disk_seqf_start, - .next = disk_seqf_next, - .stop = disk_seqf_stop, - .show = diskstats_show -}; - -static int diskstats_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &diskstats_op); -} - -static const struct file_operations proc_diskstats_operations = { - .open = diskstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init proc_genhd_init(void) -{ - proc_create("diskstats", 0, NULL, &proc_diskstats_operations); - proc_create("partitions", 0, NULL, &proc_partitions_operations); - return 0; -} -module_init(proc_genhd_init); -#endif /* CONFIG_PROC_FS */ - -static void media_change_notify_thread(struct work_struct *work) -{ - struct gendisk *gd = container_of(work, struct gendisk, async_notify); - char event[] = "MEDIA_CHANGE=1"; - char *envp[] = { event, NULL }; - - /* - * set enviroment vars to indicate which event this is for - * so that user space will know to go check the media 
status. - */ - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); - put_device(gd->driverfs_dev); -} - -#if 0 -void genhd_media_change_notify(struct gendisk *disk) -{ - get_device(disk->driverfs_dev); - schedule_work(&disk->async_notify); -} -EXPORT_SYMBOL_GPL(genhd_media_change_notify); -#endif /* 0 */ - -dev_t blk_lookup_devt(const char *name, int partno) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part; - - if (strcmp(dev_name(dev), name)) - continue; - - if (partno < disk->minors) { - /* We need to return the right devno, even - * if the partition doesn't exist yet. - */ - devt = MKDEV(MAJOR(dev->devt), - MINOR(dev->devt) + partno); - break; - } - part = disk_get_part(disk, partno); - if (part) { - devt = part_devt(part); - disk_put_part(part); - break; - } - disk_put_part(part); - } - class_dev_iter_exit(&iter); - return devt; -} -EXPORT_SYMBOL(blk_lookup_devt); - -struct gendisk *alloc_disk(int minors) -{ - return alloc_disk_node(minors, -1); -} -EXPORT_SYMBOL(alloc_disk); - -struct gendisk *alloc_disk_node(int minors, int node_id) -{ - struct gendisk *disk; - - disk = kmalloc_node(sizeof(struct gendisk), - GFP_KERNEL | __GFP_ZERO, node_id); - if (disk) { - if (!init_part_stats(&disk->part0)) { - kfree(disk); - return NULL; - } - disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_part_stats(&disk->part0); - kfree(disk); - return NULL; - } - disk->part_tbl->part[0] = &disk->part0; - - disk->minors = minors; -#ifndef DDE_LINUX - rand_initialize_disk(disk); -#endif - disk_to_dev(disk)->class = &block_class; - disk_to_dev(disk)->type = &disk_type; - device_initialize(disk_to_dev(disk)); - INIT_WORK(&disk->async_notify, - media_change_notify_thread); - } - return disk; -} -EXPORT_SYMBOL(alloc_disk_node); - -struct kobject *get_disk(struct gendisk *disk) -{ - struct module *owner; - struct kobject *kobj; - - if (!disk->fops) - return NULL; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return NULL; - kobj = kobject_get(&disk_to_dev(disk)->kobj); - if (kobj == NULL) { - module_put(owner); - return NULL; - } - return kobj; - -} - -EXPORT_SYMBOL(get_disk); - -void put_disk(struct gendisk *disk) -{ - if (disk) - kobject_put(&disk_to_dev(disk)->kobj); -} - -EXPORT_SYMBOL(put_disk); - -void set_device_ro(struct block_device *bdev, int flag) -{ - bdev->bd_part->policy = flag; -} - -EXPORT_SYMBOL(set_device_ro); - -void set_disk_ro(struct gendisk *disk, int flag) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - part->policy = flag; - disk_part_iter_exit(&piter); -} - -EXPORT_SYMBOL(set_disk_ro); - -int bdev_read_only(struct block_device *bdev) -{ - if (!bdev) - return 0; - return bdev->bd_part->policy; -} - -EXPORT_SYMBOL(bdev_read_only); - -int invalidate_partition(struct gendisk *disk, int partno) -{ - int res = 0; - struct block_device *bdev = bdget_disk(disk, partno); - if (bdev) { - fsync_bdev(bdev); - res = __invalidate_device(bdev); - bdput(bdev); - } - return res; -} - -EXPORT_SYMBOL(invalidate_partition); diff --git a/libdde_linux26/lib/src/drivers/.svn/all-wcprops b/libdde_linux26/lib/src/drivers/.svn/all-wcprops deleted file mode 100644 index 7486eb7c..00000000 --- 
a/libdde_linux26/lib/src/drivers/.svn/all-wcprops +++ /dev/null @@ -1,5 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 66 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers -END diff --git a/libdde_linux26/lib/src/drivers/.svn/entries b/libdde_linux26/lib/src/drivers/.svn/entries deleted file mode 100644 index 92a3cce5..00000000 --- a/libdde_linux26/lib/src/drivers/.svn/entries +++ /dev/null @@ -1,34 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/drivers -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -base -dir - -pci -dir - diff --git a/libdde_linux26/lib/src/drivers/.svn/format b/libdde_linux26/lib/src/drivers/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/drivers/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/drivers/base/.svn/all-wcprops b/libdde_linux26/lib/src/drivers/base/.svn/all-wcprops deleted file mode 100644 index b4d5f8c7..00000000 --- a/libdde_linux26/lib/src/drivers/base/.svn/all-wcprops +++ /dev/null @@ -1,23 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 71 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/base -END -init.c -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/base/init.c -END -class.c -K 25 -svn:wc:ra_dav:version-url -V 79 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/base/class.c -END -core.c -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/base/core.c -END diff --git a/libdde_linux26/lib/src/drivers/base/.svn/entries b/libdde_linux26/lib/src/drivers/base/.svn/entries deleted file mode 100644 index ad16ddef..00000000 --- a/libdde_linux26/lib/src/drivers/base/.svn/entries +++ /dev/null @@ -1,130 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/drivers/base -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -init.c -file - - - - -2009-11-15T17:17:12.000000Z -17b65b620f4c532617f53d0bf9125a05 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -795 - -class.c -file - - - - -2009-11-15T17:17:12.000000Z -e33913dbbb3e7526c2b6144e41ad7ad7 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -12685 - -core.c -file - - - - -2009-11-15T17:17:12.000000Z -caba481446dd962e54a2a9e5ebcaa3bc -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -42233 - diff --git a/libdde_linux26/lib/src/drivers/base/.svn/format b/libdde_linux26/lib/src/drivers/base/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/drivers/base/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/drivers/base/.svn/text-base/class.c.svn-base b/libdde_linux26/lib/src/drivers/base/.svn/text-base/class.c.svn-base deleted file mode 100644 index 1417d80b..00000000 --- a/libdde_linux26/lib/src/drivers/base/.svn/text-base/class.c.svn-base +++ /dev/null @@ -1,505 +0,0 @@ -/* - * class.c - basic device class management - * - * Copyright (c) 2002-3 Patrick Mochel - * Copyright (c) 2002-3 Open Source Development Labs - * Copyright (c) 2003-2004 Greg 
Kroah-Hartman - * Copyright (c) 2003-2004 IBM Corp. - * - * This file is released under the GPLv2 - * - */ - -#include <linux/device.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/kdev_t.h> -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/genhd.h> -#include <linux/mutex.h> -#include "base.h" - -#define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) - -static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct class_attribute *class_attr = to_class_attr(attr); - struct class_private *cp = to_class(kobj); - ssize_t ret = -EIO; - - if (class_attr->show) - ret = class_attr->show(cp->class, buf); - return ret; -} - -static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct class_attribute *class_attr = to_class_attr(attr); - struct class_private *cp = to_class(kobj); - ssize_t ret = -EIO; - - if (class_attr->store) - ret = class_attr->store(cp->class, buf, count); - return ret; -} - -static void class_release(struct kobject *kobj) -{ - struct class_private *cp = to_class(kobj); - struct class *class = cp->class; - - pr_debug("class '%s': release.\n", class->name); - - if (class->class_release) - class->class_release(class); - else - pr_debug("class '%s' does not have a release() function, " - "be careful\n", class->name); -} - -static struct sysfs_ops class_sysfs_ops = { - .show = class_attr_show, - .store = class_attr_store, -}; - -static struct kobj_type class_ktype = { - .sysfs_ops = &class_sysfs_ops, - .release = class_release, -}; - -/* Hotplug events for classes go to the class class_subsys */ -static struct kset *class_kset; - - -int class_create_file(struct class *cls, const struct class_attribute *attr) -{ - int error; - if (cls) - error = sysfs_create_file(&cls->p->class_subsys.kobj, - &attr->attr); - else - error = -EINVAL; - return error; -} - -void class_remove_file(struct class *cls, const struct class_attribute *attr) -{ - if (cls) - sysfs_remove_file(&cls->p->class_subsys.kobj, &attr->attr); -} - -static struct class *class_get(struct class *cls) -{ - if (cls) - kset_get(&cls->p->class_subsys); - return cls; -} - -static void class_put(struct class *cls) -{ - if (cls) - kset_put(&cls->p->class_subsys); -} - -static int add_class_attrs(struct class *cls) -{ - int i; - int error = 0; - - if (cls->class_attrs) { - for (i = 0; attr_name(cls->class_attrs[i]); i++) { - error = class_create_file(cls, &cls->class_attrs[i]); - if (error) - goto error; - } - } -done: - return error; -error: - while (--i >= 0) - class_remove_file(cls, &cls->class_attrs[i]); - goto done; -} - -static void remove_class_attrs(struct class *cls) -{ - int i; - - if (cls->class_attrs) { - for (i = 0; attr_name(cls->class_attrs[i]); i++) - class_remove_file(cls, &cls->class_attrs[i]); - } -} - -static void klist_class_dev_get(struct klist_node *n) -{ - struct device *dev = container_of(n, struct device, knode_class); - - get_device(dev); -} - -static void klist_class_dev_put(struct klist_node *n) -{ - struct device *dev = container_of(n, struct device, knode_class); - - put_device(dev); -} - -int __class_register(struct class *cls, struct lock_class_key *key) -{ - struct class_private *cp; - int error; - - pr_debug("device class '%s': registering\n", cls->name); - - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) - return -ENOMEM; - klist_init(&cp->class_devices, klist_class_dev_get, klist_class_dev_put); - 
INIT_LIST_HEAD(&cp->class_interfaces); - kset_init(&cp->class_dirs); - __mutex_init(&cp->class_mutex, "struct class mutex", key); - error = kobject_set_name(&cp->class_subsys.kobj, "%s", cls->name); - if (error) { - kfree(cp); - return error; - } - - /* set the default /sys/dev directory for devices of this class */ - if (!cls->dev_kobj) - cls->dev_kobj = sysfs_dev_char_kobj; - -#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) && !defined(DDE_LINUX) - /* let the block class directory show up in the root of sysfs */ - if (cls != &block_class) - cp->class_subsys.kobj.kset = class_kset; -#else - cp->class_subsys.kobj.kset = class_kset; -#endif - cp->class_subsys.kobj.ktype = &class_ktype; - cp->class = cls; - cls->p = cp; - - error = kset_register(&cp->class_subsys); - if (error) { - kfree(cp); - return error; - } - error = add_class_attrs(class_get(cls)); - class_put(cls); - return error; -} -EXPORT_SYMBOL_GPL(__class_register); - -void class_unregister(struct class *cls) -{ - pr_debug("device class '%s': unregistering\n", cls->name); - remove_class_attrs(cls); - kset_unregister(&cls->p->class_subsys); -} - -static void class_create_release(struct class *cls) -{ - pr_debug("%s called for %s\n", __func__, cls->name); - kfree(cls); -} - -/** - * class_create - create a struct class structure - * @owner: pointer to the module that is to "own" this struct class - * @name: pointer to a string for the name of this class. - * @key: the lock_class_key for this class; used by mutex lock debugging - * - * This is used to create a struct class pointer that can then be used - * in calls to device_create(). - * - * Note, the pointer created here is to be destroyed when finished by - * making a call to class_destroy(). - */ -struct class *__class_create(struct module *owner, const char *name, - struct lock_class_key *key) -{ - struct class *cls; - int retval; - - cls = kzalloc(sizeof(*cls), GFP_KERNEL); - if (!cls) { - retval = -ENOMEM; - goto error; - } - - cls->name = name; - cls->owner = owner; - cls->class_release = class_create_release; - - retval = __class_register(cls, key); - if (retval) - goto error; - - return cls; - -error: - kfree(cls); - return ERR_PTR(retval); -} -EXPORT_SYMBOL_GPL(__class_create); - -/** - * class_destroy - destroys a struct class structure - * @cls: pointer to the struct class that is to be destroyed - * - * Note, the pointer to be destroyed must have been created with a call - * to class_create(). - */ -void class_destroy(struct class *cls) -{ - if ((cls == NULL) || (IS_ERR(cls))) - return; - - class_unregister(cls); -} - -#ifdef CONFIG_SYSFS_DEPRECATED -char *make_class_name(const char *name, struct kobject *kobj) -{ - char *class_name; - int size; - - size = strlen(name) + strlen(kobject_name(kobj)) + 2; - - class_name = kmalloc(size, GFP_KERNEL); - if (!class_name) - return NULL; - - strcpy(class_name, name); - strcat(class_name, ":"); - strcat(class_name, kobject_name(kobj)); - return class_name; -} -#endif - -/** - * class_dev_iter_init - initialize class device iterator - * @iter: class iterator to initialize - * @class: the class we wanna iterate over - * @start: the device to start iterating from, if any - * @type: device_type of the devices to iterate over, NULL for all - * - * Initialize class iterator @iter such that it iterates over devices - * of @class. If @start is set, the list iteration will start there, - * otherwise if it is NULL, the iteration starts at the beginning of - * the list. 
- */ -void class_dev_iter_init(struct class_dev_iter *iter, struct class *class, - struct device *start, const struct device_type *type) -{ - struct klist_node *start_knode = NULL; - - if (start) - start_knode = &start->knode_class; - klist_iter_init_node(&class->p->class_devices, &iter->ki, start_knode); - iter->type = type; -} -EXPORT_SYMBOL_GPL(class_dev_iter_init); - -/** - * class_dev_iter_next - iterate to the next device - * @iter: class iterator to proceed - * - * Proceed @iter to the next device and return it. Returns NULL if - * iteration is complete. - * - * The returned device is referenced and won't be released till - * iterator is proceed to the next device or exited. The caller is - * free to do whatever it wants to do with the device including - * calling back into class code. - */ -struct device *class_dev_iter_next(struct class_dev_iter *iter) -{ - struct klist_node *knode; - struct device *dev; - - while (1) { - knode = klist_next(&iter->ki); - if (!knode) - return NULL; - dev = container_of(knode, struct device, knode_class); - if (!iter->type || iter->type == dev->type) - return dev; - } -} -EXPORT_SYMBOL_GPL(class_dev_iter_next); - -/** - * class_dev_iter_exit - finish iteration - * @iter: class iterator to finish - * - * Finish an iteration. Always call this function after iteration is - * complete whether the iteration ran till the end or not. - */ -void class_dev_iter_exit(struct class_dev_iter *iter) -{ - klist_iter_exit(&iter->ki); -} -EXPORT_SYMBOL_GPL(class_dev_iter_exit); - -/** - * class_for_each_device - device iterator - * @class: the class we're iterating - * @start: the device to start with in the list, if any. - * @data: data for the callback - * @fn: function to be called for each device - * - * Iterate over @class's list of devices, and call @fn for each, - * passing it @data. If @start is set, the list iteration will start - * there, otherwise if it is NULL, the iteration starts at the - * beginning of the list. - * - * We check the return of @fn each time. If it returns anything - * other than 0, we break out and return that value. - * - * @fn is allowed to do anything including calling back into class - * code. There's no locking restriction. - */ -int class_for_each_device(struct class *class, struct device *start, - void *data, int (*fn)(struct device *, void *)) -{ - struct class_dev_iter iter; - struct device *dev; - int error = 0; - - if (!class) - return -EINVAL; - if (!class->p) { - WARN(1, "%s called for class '%s' before it was initialized", - __func__, class->name); - return -EINVAL; - } - - class_dev_iter_init(&iter, class, start, NULL); - while ((dev = class_dev_iter_next(&iter))) { - error = fn(dev, data); - if (error) - break; - } - class_dev_iter_exit(&iter); - - return error; -} -EXPORT_SYMBOL_GPL(class_for_each_device); - -/** - * class_find_device - device iterator for locating a particular device - * @class: the class we're iterating - * @start: Device to begin with - * @data: data for the match function - * @match: function to check device - * - * This is similar to the class_for_each_dev() function above, but it - * returns a reference to a device that is 'found' for later use, as - * determined by the @match callback. - * - * The callback should return 0 if the device doesn't match and non-zero - * if it does. If the callback returns non-zero, this function will - * return to the caller and not iterate over any more devices. - * - * Note, you will need to drop the reference with put_device() after use. 
- * - * @fn is allowed to do anything including calling back into class - * code. There's no locking restriction. - */ -struct device *class_find_device(struct class *class, struct device *start, - void *data, - int (*match)(struct device *, void *)) -{ - struct class_dev_iter iter; - struct device *dev; - - if (!class) - return NULL; - if (!class->p) { - WARN(1, "%s called for class '%s' before it was initialized", - __func__, class->name); - return NULL; - } - - class_dev_iter_init(&iter, class, start, NULL); - while ((dev = class_dev_iter_next(&iter))) { - if (match(dev, data)) { - get_device(dev); - break; - } - } - class_dev_iter_exit(&iter); - - return dev; -} -EXPORT_SYMBOL_GPL(class_find_device); - -int class_interface_register(struct class_interface *class_intf) -{ - struct class *parent; - struct class_dev_iter iter; - struct device *dev; - - if (!class_intf || !class_intf->class) - return -ENODEV; - - parent = class_get(class_intf->class); - if (!parent) - return -EINVAL; - - mutex_lock(&parent->p->class_mutex); - list_add_tail(&class_intf->node, &parent->p->class_interfaces); - if (class_intf->add_dev) { - class_dev_iter_init(&iter, parent, NULL, NULL); - while ((dev = class_dev_iter_next(&iter))) - class_intf->add_dev(dev, class_intf); - class_dev_iter_exit(&iter); - } - mutex_unlock(&parent->p->class_mutex); - - return 0; -} - -void class_interface_unregister(struct class_interface *class_intf) -{ - struct class *parent = class_intf->class; - struct class_dev_iter iter; - struct device *dev; - - if (!parent) - return; - - mutex_lock(&parent->p->class_mutex); - list_del_init(&class_intf->node); - if (class_intf->remove_dev) { - class_dev_iter_init(&iter, parent, NULL, NULL); - while ((dev = class_dev_iter_next(&iter))) - class_intf->remove_dev(dev, class_intf); - class_dev_iter_exit(&iter); - } - mutex_unlock(&parent->p->class_mutex); - - class_put(parent); -} - -int __init classes_init(void) -{ - class_kset = kset_create_and_add("class", NULL, NULL); - if (!class_kset) - return -ENOMEM; - return 0; -} - -EXPORT_SYMBOL_GPL(class_create_file); -EXPORT_SYMBOL_GPL(class_remove_file); -EXPORT_SYMBOL_GPL(class_unregister); -EXPORT_SYMBOL_GPL(class_destroy); - -EXPORT_SYMBOL_GPL(class_interface_register); -EXPORT_SYMBOL_GPL(class_interface_unregister); diff --git a/libdde_linux26/lib/src/drivers/base/.svn/text-base/core.c.svn-base b/libdde_linux26/lib/src/drivers/base/.svn/text-base/core.c.svn-base deleted file mode 100644 index e3800714..00000000 --- a/libdde_linux26/lib/src/drivers/base/.svn/text-base/core.c.svn-base +++ /dev/null @@ -1,1633 +0,0 @@ -/* - * drivers/base/core.c - core driver model code (device registration, etc) - * - * Copyright (c) 2002-3 Patrick Mochel - * Copyright (c) 2002-3 Open Source Development Labs - * Copyright (c) 2006 Greg Kroah-Hartman <gregkh@suse.de> - * Copyright (c) 2006 Novell, Inc. 
- * - * This file is released under the GPLv2 - * - */ - -#include <linux/device.h> -#include <linux/err.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/kdev_t.h> -#include <linux/notifier.h> -#include <linux/genhd.h> -#include <linux/kallsyms.h> -#include <linux/semaphore.h> -#include <linux/mutex.h> - -#include "base.h" -#include "power/power.h" - -int (*platform_notify)(struct device *dev) = NULL; -int (*platform_notify_remove)(struct device *dev) = NULL; -static struct kobject *dev_kobj; -struct kobject *sysfs_dev_char_kobj; -struct kobject *sysfs_dev_block_kobj; - -#ifdef DDE_LINUX -#include "local.h" -#endif - -#if defined(CONFIG_BLOCK) && !defined(DDE_LINUX) -static inline int device_is_not_partition(struct device *dev) -{ - return !(dev->type == &part_type); -} -#else -static inline int device_is_not_partition(struct device *dev) -{ - return 1; -} -#endif - -/** - * dev_driver_string - Return a device's driver name, if at all possible - * @dev: struct device to get the name of - * - * Will return the device's driver's name if it is bound to a device. If - * the device is not bound to a device, it will return the name of the bus - * it is attached to. If it is not attached to a bus either, an empty - * string will be returned. - */ -const char *dev_driver_string(const struct device *dev) -{ - return dev->driver ? dev->driver->name : - (dev->bus ? dev->bus->name : - (dev->class ? dev->class->name : "")); -} -EXPORT_SYMBOL(dev_driver_string); - -#define to_dev(obj) container_of(obj, struct device, kobj) -#define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) - -static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct device_attribute *dev_attr = to_dev_attr(attr); - struct device *dev = to_dev(kobj); - ssize_t ret = -EIO; - - if (dev_attr->show) - ret = dev_attr->show(dev, dev_attr, buf); - if (ret >= (ssize_t)PAGE_SIZE) { - print_symbol("dev_attr_show: %s returned bad count\n", - (unsigned long)dev_attr->show); - } - return ret; -} - -static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct device_attribute *dev_attr = to_dev_attr(attr); - struct device *dev = to_dev(kobj); - ssize_t ret = -EIO; - - if (dev_attr->store) - ret = dev_attr->store(dev, dev_attr, buf, count); - return ret; -} - -static struct sysfs_ops dev_sysfs_ops = { - .show = dev_attr_show, - .store = dev_attr_store, -}; - - -/** - * device_release - free device structure. - * @kobj: device's kobject. - * - * This is called once the reference count for the object - * reaches 0. We forward the call to the device's release - * method, which should handle actually freeing the structure. 
- */ -static void device_release(struct kobject *kobj) -{ - struct device *dev = to_dev(kobj); - - if (dev->release) - dev->release(dev); - else if (dev->type && dev->type->release) - dev->type->release(dev); - else if (dev->class && dev->class->dev_release) - dev->class->dev_release(dev); - else - WARN(1, KERN_ERR "Device '%s' does not have a release() " - "function, it is broken and must be fixed.\n", - dev_name(dev)); -} - -static struct kobj_type device_ktype = { - .release = device_release, - .sysfs_ops = &dev_sysfs_ops, -}; - - -static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &device_ktype) { - struct device *dev = to_dev(kobj); - if (dev->uevent_suppress) - return 0; - if (dev->bus) - return 1; - if (dev->class) - return 1; - } - return 0; -} - -static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj) -{ - struct device *dev = to_dev(kobj); - - if (dev->bus) - return dev->bus->name; - if (dev->class) - return dev->class->name; - return NULL; -} - -static int dev_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) -{ - struct device *dev = to_dev(kobj); - int retval = 0; - -#ifndef DDE_LINUX - /* add the major/minor if present */ - if (MAJOR(dev->devt)) { - add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt)); - add_uevent_var(env, "MINOR=%u", MINOR(dev->devt)); - } - - if (dev->type && dev->type->name) - add_uevent_var(env, "DEVTYPE=%s", dev->type->name); - - if (dev->driver) - add_uevent_var(env, "DRIVER=%s", dev->driver->name); - -#ifdef CONFIG_SYSFS_DEPRECATED - if (dev->class) { - struct device *parent = dev->parent; - - /* find first bus device in parent chain */ - while (parent && !parent->bus) - parent = parent->parent; - if (parent && parent->bus) { - const char *path; - - path = kobject_get_path(&parent->kobj, GFP_KERNEL); - if (path) { - add_uevent_var(env, "PHYSDEVPATH=%s", path); - kfree(path); - } - - add_uevent_var(env, "PHYSDEVBUS=%s", parent->bus->name); - - if (parent->driver) - add_uevent_var(env, "PHYSDEVDRIVER=%s", - parent->driver->name); - } - } else if (dev->bus) { - add_uevent_var(env, "PHYSDEVBUS=%s", dev->bus->name); - - if (dev->driver) - add_uevent_var(env, "PHYSDEVDRIVER=%s", - dev->driver->name); - } -#endif - - /* have the bus specific function add its stuff */ - if (dev->bus && dev->bus->uevent) { - retval = dev->bus->uevent(dev, env); - if (retval) - pr_debug("device: '%s': %s: bus uevent() returned %d\n", - dev_name(dev), __func__, retval); - } - - /* have the class specific function add its stuff */ - if (dev->class && dev->class->dev_uevent) { - retval = dev->class->dev_uevent(dev, env); - if (retval) - pr_debug("device: '%s': %s: class uevent() " - "returned %d\n", dev_name(dev), - __func__, retval); - } - - /* have the device type specific fuction add its stuff */ - if (dev->type && dev->type->uevent) { - retval = dev->type->uevent(dev, env); - if (retval) - pr_debug("device: '%s': %s: dev_type uevent() " - "returned %d\n", dev_name(dev), - __func__, retval); - } -#endif - - return retval; -} - -static struct kset_uevent_ops device_uevent_ops = { - .filter = dev_uevent_filter, - .name = dev_uevent_name, - .uevent = dev_uevent, -}; - -static ssize_t show_uevent(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct kobject *top_kobj; - struct kset *kset; - struct kobj_uevent_env *env = NULL; - int i; - size_t count = 0; - int retval; - - /* search the kset, the device belongs to */ - top_kobj = 
&dev->kobj; - while (!top_kobj->kset && top_kobj->parent) - top_kobj = top_kobj->parent; - if (!top_kobj->kset) - goto out; - - kset = top_kobj->kset; - if (!kset->uevent_ops || !kset->uevent_ops->uevent) - goto out; - - /* respect filter */ - if (kset->uevent_ops && kset->uevent_ops->filter) - if (!kset->uevent_ops->filter(kset, &dev->kobj)) - goto out; - - env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - /* let the kset specific function add its keys */ - retval = kset->uevent_ops->uevent(kset, &dev->kobj, env); - if (retval) - goto out; - - /* copy keys to file */ - for (i = 0; i < env->envp_idx; i++) - count += sprintf(&buf[count], "%s\n", env->envp[i]); -out: - kfree(env); - return count; -} - -static ssize_t store_uevent(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - enum kobject_action action; - - if (kobject_action_type(buf, count, &action) == 0) { - kobject_uevent(&dev->kobj, action); - goto out; - } - - dev_err(dev, "uevent: unsupported action-string; this will " - "be ignored in a future kernel version\n"); - kobject_uevent(&dev->kobj, KOBJ_ADD); -out: - return count; -} - -static struct device_attribute uevent_attr = - __ATTR(uevent, S_IRUGO | S_IWUSR, show_uevent, store_uevent); - -static int device_add_attributes(struct device *dev, - struct device_attribute *attrs) -{ - int error = 0; - int i; - - if (attrs) { - for (i = 0; attr_name(attrs[i]); i++) { - error = device_create_file(dev, &attrs[i]); - if (error) - break; - } - if (error) - while (--i >= 0) - device_remove_file(dev, &attrs[i]); - } - return error; -} - -static void device_remove_attributes(struct device *dev, - struct device_attribute *attrs) -{ - int i; - - if (attrs) - for (i = 0; attr_name(attrs[i]); i++) - device_remove_file(dev, &attrs[i]); -} - -static int device_add_groups(struct device *dev, - struct attribute_group **groups) -{ - int error = 0; - int i; - - if (groups) { - for (i = 0; groups[i]; i++) { - error = sysfs_create_group(&dev->kobj, groups[i]); - if (error) { - while (--i >= 0) - sysfs_remove_group(&dev->kobj, - groups[i]); - break; - } - } - } - return error; -} - -static void device_remove_groups(struct device *dev, - struct attribute_group **groups) -{ - int i; - - if (groups) - for (i = 0; groups[i]; i++) - sysfs_remove_group(&dev->kobj, groups[i]); -} - -static int device_add_attrs(struct device *dev) -{ - struct class *class = dev->class; - struct device_type *type = dev->type; - int error; - - if (class) { - error = device_add_attributes(dev, class->dev_attrs); - if (error) - return error; - } - - if (type) { - error = device_add_groups(dev, type->groups); - if (error) - goto err_remove_class_attrs; - } - - error = device_add_groups(dev, dev->groups); - if (error) - goto err_remove_type_groups; - - return 0; - - err_remove_type_groups: - if (type) - device_remove_groups(dev, type->groups); - err_remove_class_attrs: - if (class) - device_remove_attributes(dev, class->dev_attrs); - - return error; -} - -static void device_remove_attrs(struct device *dev) -{ - struct class *class = dev->class; - struct device_type *type = dev->type; - - device_remove_groups(dev, dev->groups); - - if (type) - device_remove_groups(dev, type->groups); - - if (class) - device_remove_attributes(dev, class->dev_attrs); -} - - -static ssize_t show_dev(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return print_dev_t(buf, dev->devt); -} - -static struct device_attribute devt_attr = - __ATTR(dev, S_IRUGO, 
show_dev, NULL); - -/* kset to create /sys/devices/ */ -struct kset *devices_kset; - -/** - * device_create_file - create sysfs attribute file for device. - * @dev: device. - * @attr: device attribute descriptor. - */ -int device_create_file(struct device *dev, struct device_attribute *attr) -{ - int error = 0; - if (dev) - error = sysfs_create_file(&dev->kobj, &attr->attr); - return error; -} - -/** - * device_remove_file - remove sysfs attribute file. - * @dev: device. - * @attr: device attribute descriptor. - */ -void device_remove_file(struct device *dev, struct device_attribute *attr) -{ - if (dev) - sysfs_remove_file(&dev->kobj, &attr->attr); -} - -/** - * device_create_bin_file - create sysfs binary attribute file for device. - * @dev: device. - * @attr: device binary attribute descriptor. - */ -int device_create_bin_file(struct device *dev, struct bin_attribute *attr) -{ - int error = -EINVAL; - if (dev) - error = sysfs_create_bin_file(&dev->kobj, attr); - return error; -} -EXPORT_SYMBOL_GPL(device_create_bin_file); - -/** - * device_remove_bin_file - remove sysfs binary attribute file - * @dev: device. - * @attr: device binary attribute descriptor. - */ -void device_remove_bin_file(struct device *dev, struct bin_attribute *attr) -{ - if (dev) - sysfs_remove_bin_file(&dev->kobj, attr); -} -EXPORT_SYMBOL_GPL(device_remove_bin_file); - -/** - * device_schedule_callback_owner - helper to schedule a callback for a device - * @dev: device. - * @func: callback function to invoke later. - * @owner: module owning the callback routine - * - * Attribute methods must not unregister themselves or their parent device - * (which would amount to the same thing). Attempts to do so will deadlock, - * since unregistration is mutually exclusive with driver callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @dev as its - * argument in the workqueue's process context. @dev will be pinned until - * @func returns. - * - * This routine is usually called via the inline device_schedule_callback(), - * which automatically sets @owner to THIS_MODULE. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available. - * - * NOTE: This routine won't work if CONFIG_SYSFS isn't set! It uses an - * underlying sysfs routine (since it is intended for use by attribute - * methods), and if sysfs isn't available you'll get nothing but -ENOSYS. - */ -int device_schedule_callback_owner(struct device *dev, - void (*func)(struct device *), struct module *owner) -{ - return sysfs_schedule_callback(&dev->kobj, - (void (*)(void *)) func, dev, owner); -} -EXPORT_SYMBOL_GPL(device_schedule_callback_owner); - -static void klist_children_get(struct klist_node *n) -{ - struct device *dev = container_of(n, struct device, knode_parent); - - get_device(dev); -} - -static void klist_children_put(struct klist_node *n) -{ - struct device *dev = container_of(n, struct device, knode_parent); - - put_device(dev); -} - -/** - * device_initialize - init device structure. - * @dev: device. - * - * This prepares the device for use by other layers by initializing - * its fields. - * It is the first half of device_register(), if called by - * that function, though it can also be called separately, so one - * may use @dev's fields. In particular, get_device()/put_device() - * may be used for reference counting of @dev after calling this - * function. 
- * - * NOTE: Use put_device() to give up your reference instead of freeing - * @dev directly once you have called this function. - */ -void device_initialize(struct device *dev) -{ - dev->kobj.kset = devices_kset; - kobject_init(&dev->kobj, &device_ktype); - klist_init(&dev->klist_children, klist_children_get, - klist_children_put); - INIT_LIST_HEAD(&dev->dma_pools); - init_MUTEX(&dev->sem); - spin_lock_init(&dev->devres_lock); - INIT_LIST_HEAD(&dev->devres_head); - device_init_wakeup(dev, 0); - device_pm_init(dev); - set_dev_node(dev, -1); -} - -#ifdef CONFIG_SYSFS_DEPRECATED -static struct kobject *get_device_parent(struct device *dev, - struct device *parent) -{ - /* class devices without a parent live in /sys/class/<classname>/ */ - if (dev->class && (!parent || parent->class != dev->class)) - return &dev->class->p->class_subsys.kobj; - /* all other devices keep their parent */ - else if (parent) - return &parent->kobj; - - return NULL; -} - -static inline void cleanup_device_parent(struct device *dev) {} -static inline void cleanup_glue_dir(struct device *dev, - struct kobject *glue_dir) {} -#else -static struct kobject *virtual_device_parent(struct device *dev) -{ - static struct kobject *virtual_dir = NULL; - - if (!virtual_dir) - virtual_dir = kobject_create_and_add("virtual", - &devices_kset->kobj); - - return virtual_dir; -} - -static struct kobject *get_device_parent(struct device *dev, - struct device *parent) -{ - int retval; - - if (dev->class) { - struct kobject *kobj = NULL; - struct kobject *parent_kobj; - struct kobject *k; - - /* - * If we have no parent, we live in "virtual". - * Class-devices with a non class-device as parent, live - * in a "glue" directory to prevent namespace collisions. - */ - if (parent == NULL) - parent_kobj = virtual_device_parent(dev); - else if (parent->class) - return &parent->kobj; - else - parent_kobj = &parent->kobj; - - /* find our class-directory at the parent and reference it */ - spin_lock(&dev->class->p->class_dirs.list_lock); - list_for_each_entry(k, &dev->class->p->class_dirs.list, entry) - if (k->parent == parent_kobj) { - kobj = kobject_get(k); - break; - } - spin_unlock(&dev->class->p->class_dirs.list_lock); - if (kobj) - return kobj; - - /* or create a new class-directory at the parent device */ - k = kobject_create(); - if (!k) - return NULL; - k->kset = &dev->class->p->class_dirs; - retval = kobject_add(k, parent_kobj, "%s", dev->class->name); - if (retval < 0) { - kobject_put(k); - return NULL; - } - /* do not emit an uevent for this simple "glue" directory */ - return k; - } - - if (parent) - return &parent->kobj; - return NULL; -} - -static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir) -{ - /* see if we live in a "glue" directory */ - if (!glue_dir || !dev->class || - glue_dir->kset != &dev->class->p->class_dirs) - return; - - kobject_put(glue_dir); -} - -static void cleanup_device_parent(struct device *dev) -{ - cleanup_glue_dir(dev, dev->kobj.parent); -} -#endif - -static void setup_parent(struct device *dev, struct device *parent) -{ - struct kobject *kobj; - kobj = get_device_parent(dev, parent); - if (kobj) - dev->kobj.parent = kobj; -} - -static int device_add_class_symlinks(struct device *dev) -{ - int error; - - if (!dev->class) - return 0; - - error = sysfs_create_link(&dev->kobj, - &dev->class->p->class_subsys.kobj, - "subsystem"); - if (error) - goto out; - -#ifdef CONFIG_SYSFS_DEPRECATED - /* stacked class devices need a symlink in the class directory */ - if (dev->kobj.parent != 
&dev->class->p->class_subsys.kobj && - device_is_not_partition(dev)) { - error = sysfs_create_link(&dev->class->p->class_subsys.kobj, - &dev->kobj, dev_name(dev)); - if (error) - goto out_subsys; - } - - if (dev->parent && device_is_not_partition(dev)) { - struct device *parent = dev->parent; - char *class_name; - - /* - * stacked class devices have the 'device' link - * pointing to the bus device instead of the parent - */ - while (parent->class && !parent->bus && parent->parent) - parent = parent->parent; - - error = sysfs_create_link(&dev->kobj, - &parent->kobj, - "device"); - if (error) - goto out_busid; - - class_name = make_class_name(dev->class->name, - &dev->kobj); - if (class_name) - error = sysfs_create_link(&dev->parent->kobj, - &dev->kobj, class_name); - kfree(class_name); - if (error) - goto out_device; - } - return 0; - -out_device: - if (dev->parent && device_is_not_partition(dev)) - sysfs_remove_link(&dev->kobj, "device"); -out_busid - if (dev->kobj.parent != &dev->class->p->class_subsys.kobj && - device_is_not_partition(dev)) - sysfs_remove_link(&dev->class->p->class_subsys.kobj, - dev_name(dev)); -#else - /* link in the class directory pointing to the device */ - error = sysfs_create_link(&dev->class->p->class_subsys.kobj, - &dev->kobj, dev_name(dev)); - if (error) - goto out_subsys; - - if (dev->parent && device_is_not_partition(dev)) { - error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, - "device"); - if (error) - goto out_busid; - } - return 0; - -out_busid: - sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); -#endif - -out_subsys: - sysfs_remove_link(&dev->kobj, "subsystem"); -out: - return error; -} - -static void device_remove_class_symlinks(struct device *dev) -{ - if (!dev->class) - return; - -#ifdef CONFIG_SYSFS_DEPRECATED - if (dev->parent && device_is_not_partition(dev)) { - char *class_name; - - class_name = make_class_name(dev->class->name, &dev->kobj); - if (class_name) { - sysfs_remove_link(&dev->parent->kobj, class_name); - kfree(class_name); - } - sysfs_remove_link(&dev->kobj, "device"); - } - - if (dev->kobj.parent != &dev->class->p->class_subsys.kobj && - device_is_not_partition(dev)) - sysfs_remove_link(&dev->class->p->class_subsys.kobj, - dev_name(dev)); -#else - if (dev->parent && device_is_not_partition(dev)) - sysfs_remove_link(&dev->kobj, "device"); - - sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev)); -#endif - - sysfs_remove_link(&dev->kobj, "subsystem"); -} - -/** - * dev_set_name - set a device name - * @dev: device - * @fmt: format string for the device's name - */ -int dev_set_name(struct device *dev, const char *fmt, ...) -{ - va_list vargs; - char *s; - - va_start(vargs, fmt); - vsnprintf(dev->bus_id, sizeof(dev->bus_id), fmt, vargs); - va_end(vargs); - - /* ewww... some of these buggers have / in the name... */ - while ((s = strchr(dev->bus_id, '/'))) - *s = '!'; - - return 0; -} -EXPORT_SYMBOL_GPL(dev_set_name); - -/** - * device_to_dev_kobj - select a /sys/dev/ directory for the device - * @dev: device - * - * By default we select char/ for new entries. Setting class->dev_obj - * to NULL prevents an entry from being created. class->dev_kobj must - * be set (or cleared) before any devices are registered to the class - * otherwise device_create_sys_dev_entry() and - * device_remove_sys_dev_entry() will disagree about the the presence - * of the link. 
- */ -static struct kobject *device_to_dev_kobj(struct device *dev) -{ - struct kobject *kobj; - - if (dev->class) - kobj = dev->class->dev_kobj; - else - kobj = sysfs_dev_char_kobj; - - return kobj; -} - -static int device_create_sys_dev_entry(struct device *dev) -{ - struct kobject *kobj = device_to_dev_kobj(dev); - int error = 0; - char devt_str[15]; - - if (kobj) { - format_dev_t(devt_str, dev->devt); - error = sysfs_create_link(kobj, &dev->kobj, devt_str); - } - - return error; -} - -static void device_remove_sys_dev_entry(struct device *dev) -{ - struct kobject *kobj = device_to_dev_kobj(dev); - char devt_str[15]; - - if (kobj) { - format_dev_t(devt_str, dev->devt); - sysfs_remove_link(kobj, devt_str); - } -} - -/** - * device_add - add device to device hierarchy. - * @dev: device. - * - * This is part 2 of device_register(), though may be called - * separately _iff_ device_initialize() has been called separately. - * - * This adds @dev to the kobject hierarchy via kobject_add(), adds it - * to the global and sibling lists for the device, then - * adds it to the other relevant subsystems of the driver model. - * - * NOTE: _Never_ directly free @dev after calling this function, even - * if it returned an error! Always use put_device() to give up your - * reference instead. - */ -int device_add(struct device *dev) -{ - struct device *parent = NULL; - struct class_interface *class_intf; - int error = -EINVAL; - - dev = get_device(dev); - if (!dev) - goto done; - - /* Temporarily support init_name if it is set. - * It will override bus_id for now */ - if (dev->init_name) - dev_set_name(dev, "%s", dev->init_name); - - if (!strlen(dev->bus_id)) - goto done; - - pr_debug("device: '%s': %s\n", dev_name(dev), __func__); - - parent = get_device(dev->parent); - setup_parent(dev, parent); - - /* use parent numa_node */ - if (parent) - set_dev_node(dev, dev_to_node(parent)); - - /* first, register with generic layer. */ - error = kobject_add(&dev->kobj, dev->kobj.parent, "%s", dev_name(dev)); - if (error) - goto Error; - - /* notify platform of device entry */ - if (platform_notify) - platform_notify(dev); - - error = device_create_file(dev, &uevent_attr); - if (error) - goto attrError; - - if (MAJOR(dev->devt)) { - error = device_create_file(dev, &devt_attr); - if (error) - goto ueventattrError; - - error = device_create_sys_dev_entry(dev); - if (error) - goto devtattrError; - } - - error = device_add_class_symlinks(dev); - if (error) - goto SymlinkError; - error = device_add_attrs(dev); - if (error) - goto AttrsError; - error = bus_add_device(dev); - if (error) - goto BusError; - error = dpm_sysfs_add(dev); - if (error) - goto DPMError; - device_pm_add(dev); - - /* Notify clients of device addition. This call must come - * after dpm_sysf_add() and before kobject_uevent(). 
- */ - if (dev->bus) - blocking_notifier_call_chain(&dev->bus->p->bus_notifier, - BUS_NOTIFY_ADD_DEVICE, dev); - - kobject_uevent(&dev->kobj, KOBJ_ADD); - bus_attach_device(dev); - if (parent) - klist_add_tail(&dev->knode_parent, &parent->klist_children); - - if (dev->class) { - mutex_lock(&dev->class->p->class_mutex); - /* tie the class to the device */ - klist_add_tail(&dev->knode_class, - &dev->class->p->class_devices); - - /* notify any interfaces that the device is here */ - list_for_each_entry(class_intf, - &dev->class->p->class_interfaces, node) - if (class_intf->add_dev) - class_intf->add_dev(dev, class_intf); - mutex_unlock(&dev->class->p->class_mutex); - } -done: - put_device(dev); - return error; - DPMError: - bus_remove_device(dev); - BusError: - device_remove_attrs(dev); - AttrsError: - device_remove_class_symlinks(dev); - SymlinkError: - if (MAJOR(dev->devt)) - device_remove_sys_dev_entry(dev); - devtattrError: - if (MAJOR(dev->devt)) - device_remove_file(dev, &devt_attr); - ueventattrError: - device_remove_file(dev, &uevent_attr); - attrError: - kobject_uevent(&dev->kobj, KOBJ_REMOVE); - kobject_del(&dev->kobj); - Error: - cleanup_device_parent(dev); - if (parent) - put_device(parent); - goto done; -} - -/** - * device_register - register a device with the system. - * @dev: pointer to the device structure - * - * This happens in two clean steps - initialize the device - * and add it to the system. The two steps can be called - * separately, but this is the easiest and most common. - * I.e. you should only call the two helpers separately if - * have a clearly defined need to use and refcount the device - * before it is added to the hierarchy. - * - * NOTE: _Never_ directly free @dev after calling this function, even - * if it returned an error! Always use put_device() to give up the - * reference initialized in this function instead. - */ -int device_register(struct device *dev) -{ - device_initialize(dev); - return device_add(dev); -} - -/** - * get_device - increment reference count for device. - * @dev: device. - * - * This simply forwards the call to kobject_get(), though - * we do take care to provide for the case that we get a NULL - * pointer passed in. - */ -struct device *get_device(struct device *dev) -{ - return dev ? to_dev(kobject_get(&dev->kobj)) : NULL; -} - -/** - * put_device - decrement reference count. - * @dev: device in question. - */ -void put_device(struct device *dev) -{ - /* might_sleep(); */ - if (dev) - kobject_put(&dev->kobj); -} - -/** - * device_del - delete device from system. - * @dev: device. - * - * This is the first part of the device unregistration - * sequence. This removes the device from the lists we control - * from here, has it removed from the other driver model - * subsystems it was added to in device_add(), and removes it - * from the kobject hierarchy. - * - * NOTE: this should be called manually _iff_ device_add() was - * also called manually. - */ -void device_del(struct device *dev) -{ - struct device *parent = dev->parent; - struct class_interface *class_intf; - - /* Notify clients of device removal. This call must come - * before dpm_sysfs_remove(). 
- */ - if (dev->bus) - blocking_notifier_call_chain(&dev->bus->p->bus_notifier, - BUS_NOTIFY_DEL_DEVICE, dev); - device_pm_remove(dev); - dpm_sysfs_remove(dev); - if (parent) - klist_del(&dev->knode_parent); - if (MAJOR(dev->devt)) { - device_remove_sys_dev_entry(dev); - device_remove_file(dev, &devt_attr); - } - if (dev->class) { - device_remove_class_symlinks(dev); - - mutex_lock(&dev->class->p->class_mutex); - /* notify any interfaces that the device is now gone */ - list_for_each_entry(class_intf, - &dev->class->p->class_interfaces, node) - if (class_intf->remove_dev) - class_intf->remove_dev(dev, class_intf); - /* remove the device from the class list */ - klist_del(&dev->knode_class); - mutex_unlock(&dev->class->p->class_mutex); - } - device_remove_file(dev, &uevent_attr); - device_remove_attrs(dev); - bus_remove_device(dev); - - /* - * Some platform devices are driven without driver attached - * and managed resources may have been acquired. Make sure - * all resources are released. - */ - devres_release_all(dev); - - /* Notify the platform of the removal, in case they - * need to do anything... - */ - if (platform_notify_remove) - platform_notify_remove(dev); - kobject_uevent(&dev->kobj, KOBJ_REMOVE); - cleanup_device_parent(dev); - kobject_del(&dev->kobj); - put_device(parent); -} - -/** - * device_unregister - unregister device from system. - * @dev: device going away. - * - * We do this in two parts, like we do device_register(). First, - * we remove it from all the subsystems with device_del(), then - * we decrement the reference count via put_device(). If that - * is the final reference count, the device will be cleaned up - * via device_release() above. Otherwise, the structure will - * stick around until the final reference to the device is dropped. - */ -void device_unregister(struct device *dev) -{ - pr_debug("device: '%s': %s\n", dev_name(dev), __func__); - device_del(dev); - put_device(dev); -} - -static struct device *next_device(struct klist_iter *i) -{ - struct klist_node *n = klist_next(i); - return n ? container_of(n, struct device, knode_parent) : NULL; -} - -/** - * device_for_each_child - device child iterator. - * @parent: parent struct device. - * @data: data for the callback. - * @fn: function to be called for each device. - * - * Iterate over @parent's child devices, and call @fn for each, - * passing it @data. - * - * We check the return of @fn each time. If it returns anything - * other than 0, we break out and return that value. - */ -int device_for_each_child(struct device *parent, void *data, - int (*fn)(struct device *dev, void *data)) -{ - struct klist_iter i; - struct device *child; - int error = 0; - - klist_iter_init(&parent->klist_children, &i); - while ((child = next_device(&i)) && !error) - error = fn(child, data); - klist_iter_exit(&i); - return error; -} - -/** - * device_find_child - device iterator for locating a particular device. - * @parent: parent struct device - * @data: Data to pass to match function - * @match: Callback function to check device - * - * This is similar to the device_for_each_child() function above, but it - * returns a reference to a device that is 'found' for later use, as - * determined by the @match callback. - * - * The callback should return 0 if the device doesn't match and non-zero - * if it does. If the callback returns non-zero and a reference to the - * current device can be obtained, this function will return to the caller - * and not iterate over any more devices. 
- */ -struct device *device_find_child(struct device *parent, void *data, - int (*match)(struct device *dev, void *data)) -{ - struct klist_iter i; - struct device *child; - - if (!parent) - return NULL; - - klist_iter_init(&parent->klist_children, &i); - while ((child = next_device(&i))) - if (match(child, data) && get_device(child)) - break; - klist_iter_exit(&i); - return child; -} - -int __init devices_init(void) -{ - devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); - if (!devices_kset) - return -ENOMEM; - dev_kobj = kobject_create_and_add("dev", NULL); - if (!dev_kobj) - goto dev_kobj_err; - sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj); - if (!sysfs_dev_block_kobj) - goto block_kobj_err; - sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); - if (!sysfs_dev_char_kobj) - goto char_kobj_err; - - return 0; - - char_kobj_err: - kobject_put(sysfs_dev_block_kobj); - block_kobj_err: - kobject_put(dev_kobj); - dev_kobj_err: - kset_unregister(devices_kset); - return -ENOMEM; -} - -EXPORT_SYMBOL_GPL(device_for_each_child); -EXPORT_SYMBOL_GPL(device_find_child); - -EXPORT_SYMBOL_GPL(device_initialize); -EXPORT_SYMBOL_GPL(device_add); -EXPORT_SYMBOL_GPL(device_register); - -EXPORT_SYMBOL_GPL(device_del); -EXPORT_SYMBOL_GPL(device_unregister); -EXPORT_SYMBOL_GPL(get_device); -EXPORT_SYMBOL_GPL(put_device); - -EXPORT_SYMBOL_GPL(device_create_file); -EXPORT_SYMBOL_GPL(device_remove_file); - -struct root_device -{ - struct device dev; - struct module *owner; -}; - -#define to_root_device(dev) container_of(dev, struct root_device, dev) - -static void root_device_release(struct device *dev) -{ - kfree(to_root_device(dev)); -} - -/** - * __root_device_register - allocate and register a root device - * @name: root device name - * @owner: owner module of the root device, usually THIS_MODULE - * - * This function allocates a root device and registers it - * using device_register(). In order to free the returned - * device, use root_device_unregister(). - * - * Root devices are dummy devices which allow other devices - * to be grouped under /sys/devices. Use this function to - * allocate a root device and then use it as the parent of - * any device which should appear under /sys/devices/{name} - * - * The /sys/devices/{name} directory will also contain a - * 'module' symlink which points to the @owner directory - * in sysfs. - * - * Note: You probably want to use root_device_register(). - */ -struct device *__root_device_register(const char *name, struct module *owner) -{ - struct root_device *root; - int err = -ENOMEM; - - root = kzalloc(sizeof(struct root_device), GFP_KERNEL); - if (!root) - return ERR_PTR(err); - - err = dev_set_name(&root->dev, name); - if (err) { - kfree(root); - return ERR_PTR(err); - } - - root->dev.release = root_device_release; - - err = device_register(&root->dev); - if (err) { - put_device(&root->dev); - return ERR_PTR(err); - } - -#ifdef CONFIG_MODULE /* gotta find a "cleaner" way to do this */ - if (owner) { - struct module_kobject *mk = &owner->mkobj; - - err = sysfs_create_link(&root->dev.kobj, &mk->kobj, "module"); - if (err) { - device_unregister(&root->dev); - return ERR_PTR(err); - } - root->owner = owner; - } -#endif - - return &root->dev; -} -EXPORT_SYMBOL_GPL(__root_device_register); - -/** - * root_device_unregister - unregister and free a root device - * @dev: device going away - * - * This function unregisters and cleans up a device that was created by - * root_device_register(). 
- */ -void root_device_unregister(struct device *dev) -{ - struct root_device *root = to_root_device(dev); - - if (root->owner) - sysfs_remove_link(&root->dev.kobj, "module"); - - device_unregister(dev); -} -EXPORT_SYMBOL_GPL(root_device_unregister); - - -static void device_create_release(struct device *dev) -{ - pr_debug("device: '%s': %s\n", dev_name(dev), __func__); - kfree(dev); -} - -/** - * device_create_vargs - creates a device and registers it with sysfs - * @class: pointer to the struct class that this device should be registered to - * @parent: pointer to the parent struct device of this new device, if any - * @devt: the dev_t for the char device to be added - * @drvdata: the data to be added to the device for callbacks - * @fmt: string for the device's name - * @args: va_list for the device's name - * - * This function can be used by char device classes. A struct device - * will be created in sysfs, registered to the specified class. - * - * A "dev" file will be created, showing the dev_t for the device, if - * the dev_t is not 0,0. - * If a pointer to a parent struct device is passed in, the newly created - * struct device will be a child of that device in sysfs. - * The pointer to the struct device will be returned from the call. - * Any further sysfs files that might be required can be created using this - * pointer. - * - * Note: the struct class passed to this function must have previously - * been created with a call to class_create(). - */ -struct device *device_create_vargs(struct class *class, struct device *parent, - dev_t devt, void *drvdata, const char *fmt, - va_list args) -{ - struct device *dev = NULL; - int retval = -ENODEV; - - if (class == NULL || IS_ERR(class)) - goto error; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) { - retval = -ENOMEM; - goto error; - } - - dev->devt = devt; - dev->class = class; - dev->parent = parent; - dev->release = device_create_release; - dev_set_drvdata(dev, drvdata); - - vsnprintf(dev->bus_id, BUS_ID_SIZE, fmt, args); - retval = device_register(dev); - if (retval) - goto error; - - return dev; - -error: - put_device(dev); - return ERR_PTR(retval); -} -EXPORT_SYMBOL_GPL(device_create_vargs); - -/** - * device_create - creates a device and registers it with sysfs - * @class: pointer to the struct class that this device should be registered to - * @parent: pointer to the parent struct device of this new device, if any - * @devt: the dev_t for the char device to be added - * @drvdata: the data to be added to the device for callbacks - * @fmt: string for the device's name - * - * This function can be used by char device classes. A struct device - * will be created in sysfs, registered to the specified class. - * - * A "dev" file will be created, showing the dev_t for the device, if - * the dev_t is not 0,0. - * If a pointer to a parent struct device is passed in, the newly created - * struct device will be a child of that device in sysfs. - * The pointer to the struct device will be returned from the call. - * Any further sysfs files that might be required can be created using this - * pointer. - * - * Note: the struct class passed to this function must have previously - * been created with a call to class_create(). - */ -struct device *device_create(struct class *class, struct device *parent, - dev_t devt, void *drvdata, const char *fmt, ...) 
-{ - va_list vargs; - struct device *dev; - - va_start(vargs, fmt); - dev = device_create_vargs(class, parent, devt, drvdata, fmt, vargs); - va_end(vargs); - return dev; -} -EXPORT_SYMBOL_GPL(device_create); - -static int __match_devt(struct device *dev, void *data) -{ - dev_t *devt = data; - - return dev->devt == *devt; -} - -/** - * device_destroy - removes a device that was created with device_create() - * @class: pointer to the struct class that this device was registered with - * @devt: the dev_t of the device that was previously registered - * - * This call unregisters and cleans up a device that was created with a - * call to device_create(). - */ -void device_destroy(struct class *class, dev_t devt) -{ - struct device *dev; - - dev = class_find_device(class, NULL, &devt, __match_devt); - if (dev) { - put_device(dev); - device_unregister(dev); - } -} -EXPORT_SYMBOL_GPL(device_destroy); - -/** - * device_rename - renames a device - * @dev: the pointer to the struct device to be renamed - * @new_name: the new name of the device - * - * It is the responsibility of the caller to provide mutual - * exclusion between two different calls of device_rename - * on the same device to ensure that new_name is valid and - * won't conflict with other devices. - */ -int device_rename(struct device *dev, char *new_name) -{ - char *old_class_name = NULL; - char *new_class_name = NULL; - char *old_device_name = NULL; - int error; - - dev = get_device(dev); - if (!dev) - return -EINVAL; - - pr_debug("device: '%s': %s: renaming to '%s'\n", dev_name(dev), - __func__, new_name); - -#ifdef CONFIG_SYSFS_DEPRECATED - if ((dev->class) && (dev->parent)) - old_class_name = make_class_name(dev->class->name, &dev->kobj); -#endif - - old_device_name = kmalloc(BUS_ID_SIZE, GFP_KERNEL); - if (!old_device_name) { - error = -ENOMEM; - goto out; - } - strlcpy(old_device_name, dev->bus_id, BUS_ID_SIZE); - strlcpy(dev->bus_id, new_name, BUS_ID_SIZE); - - error = kobject_rename(&dev->kobj, new_name); - if (error) { - strlcpy(dev->bus_id, old_device_name, BUS_ID_SIZE); - goto out; - } - -#ifdef CONFIG_SYSFS_DEPRECATED - if (old_class_name) { - new_class_name = make_class_name(dev->class->name, &dev->kobj); - if (new_class_name) { - error = sysfs_create_link_nowarn(&dev->parent->kobj, - &dev->kobj, - new_class_name); - if (error) - goto out; - sysfs_remove_link(&dev->parent->kobj, old_class_name); - } - } -#else - if (dev->class) { - error = sysfs_create_link_nowarn(&dev->class->p->class_subsys.kobj, - &dev->kobj, dev_name(dev)); - if (error) - goto out; - sysfs_remove_link(&dev->class->p->class_subsys.kobj, - old_device_name); - } -#endif - -out: - put_device(dev); - - kfree(new_class_name); - kfree(old_class_name); - kfree(old_device_name); - - return error; -} -EXPORT_SYMBOL_GPL(device_rename); - -static int device_move_class_links(struct device *dev, - struct device *old_parent, - struct device *new_parent) -{ - int error = 0; -#ifdef CONFIG_SYSFS_DEPRECATED - char *class_name; - - class_name = make_class_name(dev->class->name, &dev->kobj); - if (!class_name) { - error = -ENOMEM; - goto out; - } - if (old_parent) { - sysfs_remove_link(&dev->kobj, "device"); - sysfs_remove_link(&old_parent->kobj, class_name); - } - if (new_parent) { - error = sysfs_create_link(&dev->kobj, &new_parent->kobj, - "device"); - if (error) - goto out; - error = sysfs_create_link(&new_parent->kobj, &dev->kobj, - class_name); - if (error) - sysfs_remove_link(&dev->kobj, "device"); - } else - error = 0; -out: - kfree(class_name); - return error; 
-#else - if (old_parent) - sysfs_remove_link(&dev->kobj, "device"); - if (new_parent) - error = sysfs_create_link(&dev->kobj, &new_parent->kobj, - "device"); - return error; -#endif -} - -/** - * device_move - moves a device to a new parent - * @dev: the pointer to the struct device to be moved - * @new_parent: the new parent of the device (can by NULL) - */ -int device_move(struct device *dev, struct device *new_parent) -{ - int error; - struct device *old_parent; - struct kobject *new_parent_kobj; - - dev = get_device(dev); - if (!dev) - return -EINVAL; - - new_parent = get_device(new_parent); - new_parent_kobj = get_device_parent(dev, new_parent); - - pr_debug("device: '%s': %s: moving to '%s'\n", dev_name(dev), - __func__, new_parent ? dev_name(new_parent) : "<NULL>"); - error = kobject_move(&dev->kobj, new_parent_kobj); - if (error) { - cleanup_glue_dir(dev, new_parent_kobj); - put_device(new_parent); - goto out; - } - old_parent = dev->parent; - dev->parent = new_parent; - if (old_parent) - klist_remove(&dev->knode_parent); - if (new_parent) { - klist_add_tail(&dev->knode_parent, &new_parent->klist_children); - set_dev_node(dev, dev_to_node(new_parent)); - } - - if (!dev->class) - goto out_put; - error = device_move_class_links(dev, old_parent, new_parent); - if (error) { - /* We ignore errors on cleanup since we're hosed anyway... */ - device_move_class_links(dev, new_parent, old_parent); - if (!kobject_move(&dev->kobj, &old_parent->kobj)) { - if (new_parent) - klist_remove(&dev->knode_parent); - dev->parent = old_parent; - if (old_parent) { - klist_add_tail(&dev->knode_parent, - &old_parent->klist_children); - set_dev_node(dev, dev_to_node(old_parent)); - } - } - cleanup_glue_dir(dev, new_parent_kobj); - put_device(new_parent); - goto out; - } -out_put: - put_device(old_parent); -out: - put_device(dev); - return error; -} -EXPORT_SYMBOL_GPL(device_move); - -/** - * device_shutdown - call ->shutdown() on each device to shutdown. - */ -void device_shutdown(void) -{ - struct device *dev, *devn; - - list_for_each_entry_safe_reverse(dev, devn, &devices_kset->list, - kobj.entry) { - if (dev->bus && dev->bus->shutdown) { - dev_dbg(dev, "shutdown\n"); - dev->bus->shutdown(dev); - } else if (dev->driver && dev->driver->shutdown) { - dev_dbg(dev, "shutdown\n"); - dev->driver->shutdown(dev); - } - } - kobject_put(sysfs_dev_char_kobj); - kobject_put(sysfs_dev_block_kobj); - kobject_put(dev_kobj); -} diff --git a/libdde_linux26/lib/src/drivers/base/.svn/text-base/init.c.svn-base b/libdde_linux26/lib/src/drivers/base/.svn/text-base/init.c.svn-base deleted file mode 100644 index ca5ac986..00000000 --- a/libdde_linux26/lib/src/drivers/base/.svn/text-base/init.c.svn-base +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2002-3 Patrick Mochel - * Copyright (c) 2002-3 Open Source Development Labs - * - * This file is released under the GPLv2 - */ - -#include <linux/device.h> -#include <linux/init.h> -#include <linux/memory.h> - -#include "base.h" - -/** - * driver_init - initialize driver model. - * - * Call the driver model init functions to initialize their - * subsystems. Called early from init/main.c. - */ -void __init driver_init(void) -{ - /* These are the core pieces */ - devices_init(); - buses_init(); - classes_init(); -#ifndef DDE_LINUX - firmware_init(); - hypervisor_init(); -#endif - - /* These are also core pieces, but must come after the - * core core pieces. 
- */ - platform_bus_init(); -#ifndef DDE_LINUX - system_bus_init(); - cpu_dev_init(); - memory_dev_init(); - attribute_container_init(); -#endif -} diff --git a/libdde_linux26/lib/src/drivers/pci/.svn/all-wcprops b/libdde_linux26/lib/src/drivers/pci/.svn/all-wcprops deleted file mode 100644 index 579ef35f..00000000 --- a/libdde_linux26/lib/src/drivers/pci/.svn/all-wcprops +++ /dev/null @@ -1,23 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 70 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/pci -END -pci-driver.c -K 25 -svn:wc:ra_dav:version-url -V 83 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/pci/pci-driver.c -END -probe.c -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/pci/probe.c -END -pci.c -K 25 -svn:wc:ra_dav:version-url -V 76 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/drivers/pci/pci.c -END diff --git a/libdde_linux26/lib/src/drivers/pci/.svn/entries b/libdde_linux26/lib/src/drivers/pci/.svn/entries deleted file mode 100644 index ba911e04..00000000 --- a/libdde_linux26/lib/src/drivers/pci/.svn/entries +++ /dev/null @@ -1,130 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/drivers/pci -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -pci-driver.c -file - - - - -2009-11-15T17:17:13.000000Z -928cd4ba1afdac7f2758391207734dff -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -23851 - -probe.c -file - - - - -2009-11-15T17:17:13.000000Z -30ba0348e208a49904d1117852afe55f -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -32125 - -pci.c -file - - - - -2009-11-15T17:17:13.000000Z -b8e363a840fc04948d1e7f74a8de59fa -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -65407 - diff --git a/libdde_linux26/lib/src/drivers/pci/.svn/format b/libdde_linux26/lib/src/drivers/pci/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/drivers/pci/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/drivers/pci/.svn/text-base/pci-driver.c.svn-base b/libdde_linux26/lib/src/drivers/pci/.svn/text-base/pci-driver.c.svn-base deleted file mode 100644 index 199ec8a7..00000000 --- a/libdde_linux26/lib/src/drivers/pci/.svn/text-base/pci-driver.c.svn-base +++ /dev/null @@ -1,1008 +0,0 @@ -/* - * drivers/pci/pci-driver.c - * - * (C) Copyright 2002-2004, 2007 Greg Kroah-Hartman <greg@kroah.com> - * (C) Copyright 2007 Novell Inc. - * - * Released under the GPL v2 only. - * - */ - -#include <linux/pci.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/device.h> -#include <linux/mempolicy.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/cpu.h> -#include "pci.h" - -#ifdef DDE_LINUX -#include "local.h" -#endif /* DDE_LINUX */ - -/* - * Dynamic device IDs are disabled for !CONFIG_HOTPLUG - */ - -struct pci_dynid { - struct list_head node; - struct pci_device_id id; -}; - -#ifdef CONFIG_HOTPLUG - -/** - * store_new_id - add a new PCI device ID to this driver and re-probe devices - * @driver: target device driver - * @buf: buffer for scanning device ID data - * @count: input size - * - * Adds a new dynamic pci device ID to this driver, - * and causes the driver to probe for all devices again. 
- */ -static ssize_t -store_new_id(struct device_driver *driver, const char *buf, size_t count) -{ - struct pci_dynid *dynid; - struct pci_driver *pdrv = to_pci_driver(driver); - const struct pci_device_id *ids = pdrv->id_table; - __u32 vendor, device, subvendor=PCI_ANY_ID, - subdevice=PCI_ANY_ID, class=0, class_mask=0; - unsigned long driver_data=0; - int fields=0; - int retval=0; - - fields = sscanf(buf, "%x %x %x %x %x %x %lx", - &vendor, &device, &subvendor, &subdevice, - &class, &class_mask, &driver_data); - if (fields < 2) - return -EINVAL; - - /* Only accept driver_data values that match an existing id_table - entry */ - if (ids) { - retval = -EINVAL; - while (ids->vendor || ids->subvendor || ids->class_mask) { - if (driver_data == ids->driver_data) { - retval = 0; - break; - } - ids++; - } - if (retval) /* No match */ - return retval; - } - - dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); - if (!dynid) - return -ENOMEM; - - dynid->id.vendor = vendor; - dynid->id.device = device; - dynid->id.subvendor = subvendor; - dynid->id.subdevice = subdevice; - dynid->id.class = class; - dynid->id.class_mask = class_mask; - dynid->id.driver_data = driver_data; - - spin_lock(&pdrv->dynids.lock); - list_add_tail(&dynid->node, &pdrv->dynids.list); - spin_unlock(&pdrv->dynids.lock); - - if (get_driver(&pdrv->driver)) { - retval = driver_attach(&pdrv->driver); - put_driver(&pdrv->driver); - } - - if (retval) - return retval; - return count; -} -static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id); - -static void -pci_free_dynids(struct pci_driver *drv) -{ - struct pci_dynid *dynid, *n; - - spin_lock(&drv->dynids.lock); - list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) { - list_del(&dynid->node); - kfree(dynid); - } - spin_unlock(&drv->dynids.lock); -} - -static int -pci_create_newid_file(struct pci_driver *drv) -{ - int error = 0; - if (drv->probe != NULL) - error = driver_create_file(&drv->driver, &driver_attr_new_id); - return error; -} - -static void pci_remove_newid_file(struct pci_driver *drv) -{ - driver_remove_file(&drv->driver, &driver_attr_new_id); -} -#else /* !CONFIG_HOTPLUG */ -static inline void pci_free_dynids(struct pci_driver *drv) {} -static inline int pci_create_newid_file(struct pci_driver *drv) -{ - return 0; -} -static inline void pci_remove_newid_file(struct pci_driver *drv) {} -#endif - -/** - * pci_match_id - See if a pci device matches a given pci_id table - * @ids: array of PCI device id structures to search in - * @dev: the PCI device structure to match against. - * - * Used by a driver to check whether a PCI device present in the - * system is in its list of supported devices. Returns the matching - * pci_device_id structure or %NULL if there is no match. - * - * Deprecated, don't use this as it will not catch any dynamic ids - * that a driver might want to check for. - */ -const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, - struct pci_dev *dev) -{ - if (ids) { - while (ids->vendor || ids->subvendor || ids->class_mask) { - if (pci_match_one_device(ids, dev)) - return ids; - ids++; - } - } - return NULL; -} - -/** - * pci_match_device - Tell if a PCI device structure has a matching PCI device id structure - * @drv: the PCI driver to match against - * @dev: the PCI device structure to match against - * - * Used by a driver to check whether a PCI device present in the - * system is in its list of supported devices. Returns the matching - * pci_device_id structure or %NULL if there is no match. 
- */ -static const struct pci_device_id *pci_match_device(struct pci_driver *drv, - struct pci_dev *dev) -{ - struct pci_dynid *dynid; - - /* Look at the dynamic ids first, before the static ones */ - spin_lock(&drv->dynids.lock); - list_for_each_entry(dynid, &drv->dynids.list, node) { - if (pci_match_one_device(&dynid->id, dev)) { - spin_unlock(&drv->dynids.lock); - return &dynid->id; - } - } - spin_unlock(&drv->dynids.lock); - - return pci_match_id(drv->id_table, dev); -} - -struct drv_dev_and_id { - struct pci_driver *drv; - struct pci_dev *dev; - const struct pci_device_id *id; -}; - -static long local_pci_probe(void *_ddi) -{ - struct drv_dev_and_id *ddi = _ddi; - - return ddi->drv->probe(ddi->dev, ddi->id); -} - -static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, - const struct pci_device_id *id) -{ - int error, node; - struct drv_dev_and_id ddi = { drv, dev, id }; - - /* Execute driver initialization on node where the device's - bus is attached to. This way the driver likely allocates - its local memory on the right node without any need to - change it. */ - node = dev_to_node(&dev->dev); - if (node >= 0) { - int cpu; - node_to_cpumask_ptr(nodecpumask, node); - - get_online_cpus(); - cpu = cpumask_any_and(nodecpumask, cpu_online_mask); - if (cpu < nr_cpu_ids) - error = work_on_cpu(cpu, local_pci_probe, &ddi); - else - error = local_pci_probe(&ddi); - put_online_cpus(); - } else - error = local_pci_probe(&ddi); - return error; -} - -/** - * __pci_device_probe() - * @drv: driver to call to check if it wants the PCI device - * @pci_dev: PCI device being probed - * - * returns 0 on success, else error. - * side-effect: pci_dev->driver is set to drv when drv claims pci_dev. - */ -static int -__pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) -{ - const struct pci_device_id *id; - int error = 0; - - if (!pci_dev->driver && drv->probe) { - error = -ENODEV; - - id = pci_match_device(drv, pci_dev); - if (id) - error = pci_call_probe(drv, pci_dev, id); - if (error >= 0) { - pci_dev->driver = drv; - error = 0; - } - } - return error; -} - -static int pci_device_probe(struct device * dev) -{ - int error = 0; - struct pci_driver *drv; - struct pci_dev *pci_dev; - - drv = to_pci_driver(dev->driver); - pci_dev = to_pci_dev(dev); - pci_dev_get(pci_dev); - error = __pci_device_probe(drv, pci_dev); - if (error) - pci_dev_put(pci_dev); - - return error; -} - -static int pci_device_remove(struct device * dev) -{ - struct pci_dev * pci_dev = to_pci_dev(dev); - struct pci_driver * drv = pci_dev->driver; - - if (drv) { - if (drv->remove) - drv->remove(pci_dev); - pci_dev->driver = NULL; - } - - /* - * If the device is still on, set the power state as "unknown", - * since it might change by the next time we load the driver. - */ - if (pci_dev->current_state == PCI_D0) - pci_dev->current_state = PCI_UNKNOWN; - - /* - * We would love to complain here if pci_dev->is_enabled is set, that - * the driver should have called pci_disable_device(), but the - * unfortunate fact is there are too many odd BIOS and bridge setups - * that don't like drivers doing that all of the time. - * Oh well, we can dream of sane hardware when we sleep, no matter how - * horrible the crap we have to deal with is when we are awake... 
- */ - - pci_dev_put(pci_dev); - return 0; -} - -static void pci_device_shutdown(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; - - if (drv && drv->shutdown) - drv->shutdown(pci_dev); - pci_msi_shutdown(pci_dev); - pci_msix_shutdown(pci_dev); -} - -#ifdef CONFIG_PM_SLEEP - -/* - * Default "suspend" method for devices that have no driver provided suspend, - * or not even a driver at all (second part). - */ -static void pci_pm_set_unknown_state(struct pci_dev *pci_dev) -{ - /* - * mark its power state as "unknown", since we don't know if - * e.g. the BIOS will change its device state when we suspend. - */ - if (pci_dev->current_state == PCI_D0) - pci_dev->current_state = PCI_UNKNOWN; -} - -/* - * Default "resume" method for devices that have no driver provided resume, - * or not even a driver at all (second part). - */ -static int pci_pm_reenable_device(struct pci_dev *pci_dev) -{ - int retval; - - /* if the device was enabled before suspend, reenable */ - retval = pci_reenable_device(pci_dev); - /* - * if the device was busmaster before the suspend, make it busmaster - * again - */ - if (pci_dev->is_busmaster) - pci_set_master(pci_dev); - - return retval; -} - -static int pci_legacy_suspend(struct device *dev, pm_message_t state) -{ -#ifndef DDE_LINUX - struct pci_dev * pci_dev = to_pci_dev(dev); - struct pci_driver * drv = pci_dev->driver; - int i = 0; - - if (drv && drv->suspend) { - pci_power_t prev = pci_dev->current_state; - - pci_dev->state_saved = false; - - i = drv->suspend(pci_dev, state); - suspend_report_result(drv->suspend, i); - if (i) - return i; - - if (pci_dev->state_saved) - goto Fixup; - - if (pci_dev->current_state != PCI_D0 - && pci_dev->current_state != PCI_UNKNOWN) { - WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: Device state not saved by %pF\n", - drv->suspend); - goto Fixup; - } - } - - pci_save_state(pci_dev); - /* - * This is for compatibility with existing code with legacy PM support. - */ - pci_pm_set_unknown_state(pci_dev); - - Fixup: - pci_fixup_device(pci_fixup_suspend, pci_dev); - - return i; -#else - WARN_UNIMPL; - return 0; -#endif /* DDE_LINUX */ -} - -static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) -{ -#ifndef DDE_LINUX - struct pci_dev * pci_dev = to_pci_dev(dev); - struct pci_driver * drv = pci_dev->driver; - int i = 0; - - if (drv && drv->suspend_late) { - i = drv->suspend_late(pci_dev, state); - suspend_report_result(drv->suspend_late, i); - } - return i; -#else - WARN_UNIMPL; - return 0; -#endif -} - -static int pci_legacy_resume_early(struct device *dev) -{ - struct pci_dev * pci_dev = to_pci_dev(dev); - struct pci_driver * drv = pci_dev->driver; - - return drv && drv->resume_early ? - drv->resume_early(pci_dev) : 0; -} - -static int pci_legacy_resume(struct device *dev) -{ - struct pci_dev * pci_dev = to_pci_dev(dev); - struct pci_driver * drv = pci_dev->driver; - - pci_fixup_device(pci_fixup_resume, pci_dev); - - return drv && drv->resume ? 
- drv->resume(pci_dev) : pci_pm_reenable_device(pci_dev); -} - -/* Auxiliary functions used by the new power management framework */ - -static void pci_pm_default_resume_noirq(struct pci_dev *pci_dev) -{ - pci_restore_standard_config(pci_dev); - pci_dev->state_saved = false; - pci_fixup_device(pci_fixup_resume_early, pci_dev); -} - -static void pci_pm_default_resume(struct pci_dev *pci_dev) -{ - pci_fixup_device(pci_fixup_resume, pci_dev); - - if (!pci_is_bridge(pci_dev)) - pci_enable_wake(pci_dev, PCI_D0, false); -} - -static void pci_pm_default_suspend(struct pci_dev *pci_dev) -{ - /* Disable non-bridge devices without PM support */ - if (!pci_is_bridge(pci_dev)) - pci_disable_enabled_device(pci_dev); - pci_save_state(pci_dev); -} - -static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) -{ - struct pci_driver *drv = pci_dev->driver; - bool ret = drv && (drv->suspend || drv->suspend_late || drv->resume - || drv->resume_early); - - /* - * Legacy PM support is used by default, so warn if the new framework is - * supported as well. Drivers are supposed to support either the - * former, or the latter, but not both at the same time. - */ - WARN_ON(ret && drv->driver.pm); - - return ret; -} - -/* New power management framework */ - -static int pci_pm_prepare(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int error = 0; - - if (drv && drv->pm && drv->pm->prepare) - error = drv->pm->prepare(dev); - - return error; -} - -static void pci_pm_complete(struct device *dev) -{ - struct device_driver *drv = dev->driver; - - if (drv && drv->pm && drv->pm->complete) - drv->pm->complete(dev); -} - -#ifdef CONFIG_SUSPEND - -static int pci_pm_suspend(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend(dev, PMSG_SUSPEND); - - if (!pm) { - pci_pm_default_suspend(pci_dev); - goto Fixup; - } - - pci_dev->state_saved = false; - - if (pm->suspend) { - pci_power_t prev = pci_dev->current_state; - int error; - - error = pm->suspend(dev); - suspend_report_result(pm->suspend, error); - if (error) - return error; - - if (pci_dev->state_saved) - goto Fixup; - - if (pci_dev->current_state != PCI_D0 - && pci_dev->current_state != PCI_UNKNOWN) { - WARN_ONCE(pci_dev->current_state != prev, - "PCI PM: State of device not saved by %pF\n", - pm->suspend); - goto Fixup; - } - } - - if (!pci_dev->state_saved) { - pci_save_state(pci_dev); - if (!pci_is_bridge(pci_dev)) - pci_prepare_to_sleep(pci_dev); - } - - Fixup: - pci_fixup_device(pci_fixup_suspend, pci_dev); - - return 0; -} - -static int pci_pm_suspend_noirq(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct device_driver *drv = dev->driver; - int error = 0; - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend_late(dev, PMSG_SUSPEND); - - if (drv && drv->pm && drv->pm->suspend_noirq) { - error = drv->pm->suspend_noirq(dev); - suspend_report_result(drv->pm->suspend_noirq, error); - } - - if (!error) - pci_pm_set_unknown_state(pci_dev); - - return error; -} - -static int pci_pm_resume_noirq(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct device_driver *drv = dev->driver; - int error = 0; - - pci_pm_default_resume_noirq(pci_dev); - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_resume_early(dev); - - if (drv && drv->pm && drv->pm->resume_noirq) - error = drv->pm->resume_noirq(dev); - - return error; -} - -static int pci_pm_resume(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int error = 0; - - /* - * This is necessary for the suspend error path in which resume is - * called without restoring the standard config registers of the device. - */ - if (pci_dev->state_saved) - pci_restore_standard_config(pci_dev); - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_resume(dev); - - pci_pm_default_resume(pci_dev); - - if (pm) { - if (pm->resume) - error = pm->resume(dev); - } else { - pci_pm_reenable_device(pci_dev); - } - - return 0; -} - -#else /* !CONFIG_SUSPEND */ - -#define pci_pm_suspend NULL -#define pci_pm_suspend_noirq NULL -#define pci_pm_resume NULL -#define pci_pm_resume_noirq NULL - -#endif /* !CONFIG_SUSPEND */ - -#ifdef CONFIG_HIBERNATION - -static int pci_pm_freeze(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend(dev, PMSG_FREEZE); - - if (!pm) { - pci_pm_default_suspend(pci_dev); - return 0; - } - - pci_dev->state_saved = false; - - if (pm->freeze) { - int error; - - error = pm->freeze(dev); - suspend_report_result(pm->freeze, error); - if (error) - return error; - } - - if (!pci_dev->state_saved) - pci_save_state(pci_dev); - - return 0; -} - -static int pci_pm_freeze_noirq(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct device_driver *drv = dev->driver; - int error = 0; - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend_late(dev, PMSG_FREEZE); - - if (drv && drv->pm && drv->pm->freeze_noirq) { - error = drv->pm->freeze_noirq(dev); - suspend_report_result(drv->pm->freeze_noirq, error); - } - - if (!error) - pci_pm_set_unknown_state(pci_dev); - - return error; -} - -static int pci_pm_thaw_noirq(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct device_driver *drv = dev->driver; - int error = 0; - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_resume_early(dev); - - pci_update_current_state(pci_dev, PCI_D0); - - if (drv && drv->pm && drv->pm->thaw_noirq) - error = drv->pm->thaw_noirq(dev); - - return error; -} - -static int pci_pm_thaw(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int error = 0; - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_resume(dev); - - if (pm) { - if (pm->thaw) - error = pm->thaw(dev); - } else { - pci_pm_reenable_device(pci_dev); - } - - return error; -} - -static int pci_pm_poweroff(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int error = 0; - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_suspend(dev, PMSG_HIBERNATE); - - if (!pm) { - pci_pm_default_suspend(pci_dev); - goto Fixup; - } - - pci_dev->state_saved = false; - - if (pm->poweroff) { - error = pm->poweroff(dev); - suspend_report_result(pm->poweroff, error); - } - - if (!pci_dev->state_saved && !pci_is_bridge(pci_dev)) - pci_prepare_to_sleep(pci_dev); - - Fixup: - pci_fixup_device(pci_fixup_suspend, pci_dev); - - return error; -} - -static int pci_pm_poweroff_noirq(struct device *dev) -{ - struct device_driver *drv = dev->driver; - int error = 0; - - if (pci_has_legacy_pm_support(to_pci_dev(dev))) - return pci_legacy_suspend_late(dev, PMSG_HIBERNATE); - - if (drv && drv->pm && drv->pm->poweroff_noirq) { - error = drv->pm->poweroff_noirq(dev); - suspend_report_result(drv->pm->poweroff_noirq, error); - } - - return error; -} - -static int pci_pm_restore_noirq(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct device_driver *drv = dev->driver; - int error = 0; - - pci_pm_default_resume_noirq(pci_dev); - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_resume_early(dev); - - if (drv && drv->pm && drv->pm->restore_noirq) - error = drv->pm->restore_noirq(dev); - - return error; -} - -static int pci_pm_restore(struct device *dev) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - int error = 0; - - /* - * This is necessary for the hibernation error path in which restore is - * called without restoring the standard config registers of the device. 
- */ - if (pci_dev->state_saved) - pci_restore_standard_config(pci_dev); - - if (pci_has_legacy_pm_support(pci_dev)) - return pci_legacy_resume(dev); - - pci_pm_default_resume(pci_dev); - - if (pm) { - if (pm->restore) - error = pm->restore(dev); - } else { - pci_pm_reenable_device(pci_dev); - } - - return error; -} - -#else /* !CONFIG_HIBERNATION */ - -#define pci_pm_freeze NULL -#define pci_pm_freeze_noirq NULL -#define pci_pm_thaw NULL -#define pci_pm_thaw_noirq NULL -#define pci_pm_poweroff NULL -#define pci_pm_poweroff_noirq NULL -#define pci_pm_restore NULL -#define pci_pm_restore_noirq NULL - -#endif /* !CONFIG_HIBERNATION */ - -struct dev_pm_ops pci_dev_pm_ops = { - .prepare = pci_pm_prepare, - .complete = pci_pm_complete, - .suspend = pci_pm_suspend, - .resume = pci_pm_resume, - .freeze = pci_pm_freeze, - .thaw = pci_pm_thaw, - .poweroff = pci_pm_poweroff, - .restore = pci_pm_restore, - .suspend_noirq = pci_pm_suspend_noirq, - .resume_noirq = pci_pm_resume_noirq, - .freeze_noirq = pci_pm_freeze_noirq, - .thaw_noirq = pci_pm_thaw_noirq, - .poweroff_noirq = pci_pm_poweroff_noirq, - .restore_noirq = pci_pm_restore_noirq, -}; - -#define PCI_PM_OPS_PTR (&pci_dev_pm_ops) - -#else /* !CONFIG_PM_SLEEP */ - -#define PCI_PM_OPS_PTR NULL - -#endif /* !CONFIG_PM_SLEEP */ - -/** - * __pci_register_driver - register a new pci driver - * @drv: the driver structure to register - * @owner: owner module of drv - * @mod_name: module name string - * - * Adds the driver structure to the list of registered drivers. - * Returns a negative value on error, otherwise 0. - * If no error occurred, the driver remains registered even if - * no device was claimed during registration. - */ -int __pci_register_driver(struct pci_driver *drv, struct module *owner, - const char *mod_name) -{ - int error; - - /* initialize common driver fields */ - drv->driver.name = drv->name; - drv->driver.bus = &pci_bus_type; - drv->driver.owner = owner; - drv->driver.mod_name = mod_name; - - spin_lock_init(&drv->dynids.lock); - INIT_LIST_HEAD(&drv->dynids.list); - - /* register with core */ - error = driver_register(&drv->driver); - if (error) - return error; - - error = pci_create_newid_file(drv); - if (error) - driver_unregister(&drv->driver); - - return error; -} - -/** - * pci_unregister_driver - unregister a pci driver - * @drv: the driver structure to unregister - * - * Deletes the driver structure from the list of registered PCI drivers, - * gives it a chance to clean up by calling its remove() function for - * each device it was responsible for, and marks those devices as - * driverless. - */ - -void -pci_unregister_driver(struct pci_driver *drv) -{ - pci_remove_newid_file(drv); - driver_unregister(&drv->driver); - pci_free_dynids(drv); -} - -static struct pci_driver pci_compat_driver = { - .name = "compat" -}; - -/** - * pci_dev_driver - get the pci_driver of a device - * @dev: the device to query - * - * Returns the appropriate pci_driver structure or %NULL if there is no - * registered driver for the device. 
- */ -struct pci_driver * -pci_dev_driver(const struct pci_dev *dev) -{ - if (dev->driver) - return dev->driver; - else { - int i; - for(i=0; i<=PCI_ROM_RESOURCE; i++) - if (dev->resource[i].flags & IORESOURCE_BUSY) - return &pci_compat_driver; - } - return NULL; -} - -/** - * pci_bus_match - Tell if a PCI device structure has a matching PCI device id structure - * @dev: the PCI device structure to match against - * @drv: the device driver to search for matching PCI device id structures - * - * Used by a driver to check whether a PCI device present in the - * system is in its list of supported devices. Returns the matching - * pci_device_id structure or %NULL if there is no match. - */ -static int pci_bus_match(struct device *dev, struct device_driver *drv) -{ - struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *pci_drv = to_pci_driver(drv); - const struct pci_device_id *found_id; - - found_id = pci_match_device(pci_drv, pci_dev); - if (found_id) - return 1; - - return 0; -} - -/** - * pci_dev_get - increments the reference count of the pci device structure - * @dev: the device being referenced - * - * Each live reference to a device should be refcounted. - * - * Drivers for PCI devices should normally record such references in - * their probe() methods, when they bind to a device, and release - * them by calling pci_dev_put(), in their disconnect() methods. - * - * A pointer to the device with the incremented reference counter is returned. - */ -struct pci_dev *pci_dev_get(struct pci_dev *dev) -{ - if (dev) - get_device(&dev->dev); - return dev; -} - -/** - * pci_dev_put - release a use of the pci device structure - * @dev: device that's been disconnected - * - * Must be called when a user of a device is finished with it. When the last - * user of the device calls this function, the memory of the device is freed. - */ -void pci_dev_put(struct pci_dev *dev) -{ - if (dev) - put_device(&dev->dev); -} - -#ifndef CONFIG_HOTPLUG -int pci_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - return -ENODEV; -} -#endif - -struct bus_type pci_bus_type = { - .name = "pci", - .match = pci_bus_match, - .uevent = pci_uevent, - .probe = pci_device_probe, - .remove = pci_device_remove, - .shutdown = pci_device_shutdown, -#ifndef DDE_LINUX - .dev_attrs = pci_dev_attrs, -#endif - .pm = PCI_PM_OPS_PTR, -}; - -static int __init pci_driver_init(void) -{ - return bus_register(&pci_bus_type); -} - -postcore_initcall(pci_driver_init); - -EXPORT_SYMBOL(pci_match_id); -EXPORT_SYMBOL(__pci_register_driver); -EXPORT_SYMBOL(pci_unregister_driver); -EXPORT_SYMBOL(pci_dev_driver); -EXPORT_SYMBOL(pci_bus_type); -EXPORT_SYMBOL(pci_dev_get); -EXPORT_SYMBOL(pci_dev_put); diff --git a/libdde_linux26/lib/src/drivers/pci/.svn/text-base/pci.c.svn-base b/libdde_linux26/lib/src/drivers/pci/.svn/text-base/pci.c.svn-base deleted file mode 100644 index f67bf734..00000000 --- a/libdde_linux26/lib/src/drivers/pci/.svn/text-base/pci.c.svn-base +++ /dev/null @@ -1,2480 +0,0 @@ -/* - * PCI Bus Services, see include/linux/pci.h for further explanation. 
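/*
 * A minimal sketch, assuming a hypothetical driver "foo" that keeps the
 * device pointer in long-lived bookkeeping: pin the struct pci_dev with
 * pci_dev_get() as described above, and drop the reference with
 * pci_dev_put() when done.
 */
#include <linux/pci.h>

struct foo_worker {			/* hypothetical per-driver state */
	struct pci_dev *pdev;
};

static void foo_worker_attach(struct foo_worker *w, struct pci_dev *pdev)
{
	w->pdev = pci_dev_get(pdev);	/* take a counted reference */
}

static void foo_worker_detach(struct foo_worker *w)
{
	pci_dev_put(w->pdev);		/* last put releases the device */
	w->pdev = NULL;
}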
- * - * Copyright 1993 -- 1997 Drew Eckhardt, Frederic Potter, - * David Mosberger-Tang - * - * Copyright 1997 -- 2000 Martin Mares <mj@ucw.cz> - */ - -#include <linux/kernel.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/pm.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/log2.h> -#include <linux/pci-aspm.h> -#include <linux/pm_wakeup.h> -#include <linux/interrupt.h> -#include <asm/dma.h> /* isa_dma_bridge_buggy */ -#include "pci.h" - -#ifdef DDE_LINUX -#include "local.h" -#endif - -unsigned int pci_pm_d3_delay = PCI_PM_D3_WAIT; - -#ifdef CONFIG_PCI_DOMAINS -int pci_domains_supported = 1; -#endif - -#define DEFAULT_CARDBUS_IO_SIZE (256) -#define DEFAULT_CARDBUS_MEM_SIZE (64*1024*1024) -/* pci=cbmemsize=nnM,cbiosize=nn can override this */ -unsigned long pci_cardbus_io_size = DEFAULT_CARDBUS_IO_SIZE; -unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; - -/** - * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children - * @bus: pointer to PCI bus structure to search - * - * Given a PCI bus, returns the highest PCI bus number present in the set - * including the given PCI bus and its list of child PCI buses. - */ -unsigned char pci_bus_max_busnr(struct pci_bus* bus) -{ - struct list_head *tmp; - unsigned char max, n; - - max = bus->subordinate; - list_for_each(tmp, &bus->children) { - n = pci_bus_max_busnr(pci_bus_b(tmp)); - if(n > max) - max = n; - } - return max; -} -EXPORT_SYMBOL_GPL(pci_bus_max_busnr); - -#ifdef CONFIG_HAS_IOMEM -void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar) -{ - /* - * Make sure the BAR is actually a memory resource, not an IO resource - */ - if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) { - WARN_ON(1); - return NULL; - } - return ioremap_nocache(pci_resource_start(pdev, bar), - pci_resource_len(pdev, bar)); -} -EXPORT_SYMBOL_GPL(pci_ioremap_bar); -#endif - -#if 0 -/** - * pci_max_busnr - returns maximum PCI bus number - * - * Returns the highest PCI bus number present in the system global list of - * PCI buses. 
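/*
 * A minimal sketch, assuming a memory BAR 0 and a readable register at
 * offset 0 (both hypothetical): pci_ioremap_bar() above maps the whole BAR
 * and already rejects I/O-port BARs, so no flags check is needed here.
 */
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/pci.h>

static int foo_peek_bar0(struct pci_dev *pdev)
{
	void __iomem *regs;

	regs = pci_ioremap_bar(pdev, 0);
	if (!regs)
		return -ENOMEM;

	dev_info(&pdev->dev, "BAR0 word 0: %#x\n", ioread32(regs));

	iounmap(regs);
	return 0;
}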
- */ -unsigned char __devinit -pci_max_busnr(void) -{ - struct pci_bus *bus = NULL; - unsigned char max, n; - - max = 0; - while ((bus = pci_find_next_bus(bus)) != NULL) { - n = pci_bus_max_busnr(bus); - if(n > max) - max = n; - } - return max; -} - -#endif /* 0 */ - -#define PCI_FIND_CAP_TTL 48 - -static int __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, - u8 pos, int cap, int *ttl) -{ - u8 id; - - while ((*ttl)--) { - pci_bus_read_config_byte(bus, devfn, pos, &pos); - if (pos < 0x40) - break; - pos &= ~3; - pci_bus_read_config_byte(bus, devfn, pos + PCI_CAP_LIST_ID, - &id); - if (id == 0xff) - break; - if (id == cap) - return pos; - pos += PCI_CAP_LIST_NEXT; - } - return 0; -} - -static int __pci_find_next_cap(struct pci_bus *bus, unsigned int devfn, - u8 pos, int cap) -{ - int ttl = PCI_FIND_CAP_TTL; - - return __pci_find_next_cap_ttl(bus, devfn, pos, cap, &ttl); -} - -int pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap) -{ - return __pci_find_next_cap(dev->bus, dev->devfn, - pos + PCI_CAP_LIST_NEXT, cap); -} -EXPORT_SYMBOL_GPL(pci_find_next_capability); - -static int __pci_bus_find_cap_start(struct pci_bus *bus, - unsigned int devfn, u8 hdr_type) -{ - u16 status; - - pci_bus_read_config_word(bus, devfn, PCI_STATUS, &status); - if (!(status & PCI_STATUS_CAP_LIST)) - return 0; - - switch (hdr_type) { - case PCI_HEADER_TYPE_NORMAL: - case PCI_HEADER_TYPE_BRIDGE: - return PCI_CAPABILITY_LIST; - case PCI_HEADER_TYPE_CARDBUS: - return PCI_CB_CAPABILITY_LIST; - default: - return 0; - } - - return 0; -} - -/** - * pci_find_capability - query for devices' capabilities - * @dev: PCI device to query - * @cap: capability code - * - * Tell if a device supports a given PCI capability. - * Returns the address of the requested capability structure within the - * device's PCI configuration space or 0 in case the device does not - * support it. Possible values for @cap: - * - * %PCI_CAP_ID_PM Power Management - * %PCI_CAP_ID_AGP Accelerated Graphics Port - * %PCI_CAP_ID_VPD Vital Product Data - * %PCI_CAP_ID_SLOTID Slot Identification - * %PCI_CAP_ID_MSI Message Signalled Interrupts - * %PCI_CAP_ID_CHSWP CompactPCI HotSwap - * %PCI_CAP_ID_PCIX PCI-X - * %PCI_CAP_ID_EXP PCI Express - */ -int pci_find_capability(struct pci_dev *dev, int cap) -{ - int pos; - - pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); - if (pos) - pos = __pci_find_next_cap(dev->bus, dev->devfn, pos, cap); - - return pos; -} - -/** - * pci_bus_find_capability - query for devices' capabilities - * @bus: the PCI bus to query - * @devfn: PCI device to query - * @cap: capability code - * - * Like pci_find_capability() but works for pci devices that do not have a - * pci_dev structure set up yet. - * - * Returns the address of the requested capability structure within the - * device's PCI configuration space or 0 in case the device does not - * support it. - */ -int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap) -{ - int pos; - u8 hdr_type; - - pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type); - - pos = __pci_bus_find_cap_start(bus, devfn, hdr_type & 0x7f); - if (pos) - pos = __pci_find_next_cap(bus, devfn, pos, cap); - - return pos; -} - -/** - * pci_find_ext_capability - Find an extended capability - * @dev: PCI device to query - * @cap: capability code - * - * Returns the address of the requested extended capability structure - * within the device's PCI configuration space or 0 if the device does - * not support it. 
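/*
 * A minimal sketch of the usual caller-side pattern for
 * pci_find_capability() documented above: locate the capability, then read
 * registers relative to the returned offset (here the PCI PM capability).
 */
#include <linux/pci.h>

static void foo_dump_pm_cap(struct pci_dev *pdev)
{
	int pos;
	u16 pmc;

	pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
	if (!pos)
		return;		/* device has no PCI PM capability */

	pci_read_config_word(pdev, pos + PCI_PM_PMC, &pmc);
	dev_info(&pdev->dev, "PM capability version %u\n",
		 pmc & PCI_PM_CAP_VER_MASK);
}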
Possible values for @cap: - * - * %PCI_EXT_CAP_ID_ERR Advanced Error Reporting - * %PCI_EXT_CAP_ID_VC Virtual Channel - * %PCI_EXT_CAP_ID_DSN Device Serial Number - * %PCI_EXT_CAP_ID_PWR Power Budgeting - */ -int pci_find_ext_capability(struct pci_dev *dev, int cap) -{ - u32 header; - int ttl; - int pos = PCI_CFG_SPACE_SIZE; - - /* minimum 8 bytes per capability */ - ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8; - - if (dev->cfg_size <= PCI_CFG_SPACE_SIZE) - return 0; - - if (pci_read_config_dword(dev, pos, &header) != PCIBIOS_SUCCESSFUL) - return 0; - - /* - * If we have no capabilities, this is indicated by cap ID, - * cap version and next pointer all being 0. - */ - if (header == 0) - return 0; - - while (ttl-- > 0) { - if (PCI_EXT_CAP_ID(header) == cap) - return pos; - - pos = PCI_EXT_CAP_NEXT(header); - if (pos < PCI_CFG_SPACE_SIZE) - break; - - if (pci_read_config_dword(dev, pos, &header) != PCIBIOS_SUCCESSFUL) - break; - } - - return 0; -} -EXPORT_SYMBOL_GPL(pci_find_ext_capability); - -static int __pci_find_next_ht_cap(struct pci_dev *dev, int pos, int ht_cap) -{ - int rc, ttl = PCI_FIND_CAP_TTL; - u8 cap, mask; - - if (ht_cap == HT_CAPTYPE_SLAVE || ht_cap == HT_CAPTYPE_HOST) - mask = HT_3BIT_CAP_MASK; - else - mask = HT_5BIT_CAP_MASK; - - pos = __pci_find_next_cap_ttl(dev->bus, dev->devfn, pos, - PCI_CAP_ID_HT, &ttl); - while (pos) { - rc = pci_read_config_byte(dev, pos + 3, &cap); - if (rc != PCIBIOS_SUCCESSFUL) - return 0; - - if ((cap & mask) == ht_cap) - return pos; - - pos = __pci_find_next_cap_ttl(dev->bus, dev->devfn, - pos + PCI_CAP_LIST_NEXT, - PCI_CAP_ID_HT, &ttl); - } - - return 0; -} -/** - * pci_find_next_ht_capability - query a device's Hypertransport capabilities - * @dev: PCI device to query - * @pos: Position from which to continue searching - * @ht_cap: Hypertransport capability code - * - * To be used in conjunction with pci_find_ht_capability() to search for - * all capabilities matching @ht_cap. @pos should always be a value returned - * from pci_find_ht_capability(). - * - * NB. To be 100% safe against broken PCI devices, the caller should take - * steps to avoid an infinite loop. - */ -int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap) -{ - return __pci_find_next_ht_cap(dev, pos + PCI_CAP_LIST_NEXT, ht_cap); -} -EXPORT_SYMBOL_GPL(pci_find_next_ht_capability); - -/** - * pci_find_ht_capability - query a device's Hypertransport capabilities - * @dev: PCI device to query - * @ht_cap: Hypertransport capability code - * - * Tell if a device supports a given Hypertransport capability. - * Returns an address within the device's PCI configuration space - * or 0 in case the device does not support the request capability. - * The address points to the PCI capability, of type PCI_CAP_ID_HT, - * which has a Hypertransport capability matching @ht_cap. - */ -int pci_find_ht_capability(struct pci_dev *dev, int ht_cap) -{ - int pos; - - pos = __pci_bus_find_cap_start(dev->bus, dev->devfn, dev->hdr_type); - if (pos) - pos = __pci_find_next_ht_cap(dev, pos, ht_cap); - - return pos; -} -EXPORT_SYMBOL_GPL(pci_find_ht_capability); - -/** - * pci_find_parent_resource - return resource region of parent bus of given region - * @dev: PCI device structure contains resources to be searched - * @res: child resource record for which parent is sought - * - * For given resource region of given device, return the resource - * region of parent bus the given region is contained in or where - * it should be allocated from. 
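/*
 * A minimal sketch using pci_find_ext_capability() above: locate the
 * Advanced Error Reporting block and read its uncorrectable-error status
 * register. Purely illustrative; real AER handling lives in the AER driver.
 */
#include <linux/pci.h>

static void foo_dump_aer_status(struct pci_dev *pdev)
{
	int pos;
	u32 status;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
	if (!pos)
		return;		/* no AER extended capability */

	pci_read_config_dword(pdev, pos + PCI_ERR_UNCOR_STATUS, &status);
	dev_info(&pdev->dev, "AER uncorrectable status: %#010x\n", status);
}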
- */ -struct resource * -pci_find_parent_resource(const struct pci_dev *dev, struct resource *res) -{ - const struct pci_bus *bus = dev->bus; - int i; - struct resource *best = NULL; - - for(i = 0; i < PCI_BUS_NUM_RESOURCES; i++) { - struct resource *r = bus->resource[i]; - if (!r) - continue; - if (res->start && !(res->start >= r->start && res->end <= r->end)) - continue; /* Not contained */ - if ((res->flags ^ r->flags) & (IORESOURCE_IO | IORESOURCE_MEM)) - continue; /* Wrong type */ - if (!((res->flags ^ r->flags) & IORESOURCE_PREFETCH)) - return r; /* Exact match */ - if ((res->flags & IORESOURCE_PREFETCH) && !(r->flags & IORESOURCE_PREFETCH)) - best = r; /* Approximating prefetchable by non-prefetchable */ - } - return best; -} - -/** - * pci_restore_bars - restore a devices BAR values (e.g. after wake-up) - * @dev: PCI device to have its BARs restored - * - * Restore the BAR values for a given device, so as to make it - * accessible by its driver. - */ -static void -pci_restore_bars(struct pci_dev *dev) -{ - int i; - - for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) - pci_update_resource(dev, i); -} - -static struct pci_platform_pm_ops *pci_platform_pm; - -int pci_set_platform_pm(struct pci_platform_pm_ops *ops) -{ - if (!ops->is_manageable || !ops->set_state || !ops->choose_state - || !ops->sleep_wake || !ops->can_wakeup) - return -EINVAL; - pci_platform_pm = ops; - return 0; -} - -static inline bool platform_pci_power_manageable(struct pci_dev *dev) -{ - return pci_platform_pm ? pci_platform_pm->is_manageable(dev) : false; -} - -static inline int platform_pci_set_power_state(struct pci_dev *dev, - pci_power_t t) -{ - return pci_platform_pm ? pci_platform_pm->set_state(dev, t) : -ENOSYS; -} - -static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev) -{ - return pci_platform_pm ? - pci_platform_pm->choose_state(dev) : PCI_POWER_ERROR; -} - -static inline bool platform_pci_can_wakeup(struct pci_dev *dev) -{ - return pci_platform_pm ? pci_platform_pm->can_wakeup(dev) : false; -} - -static inline int platform_pci_sleep_wake(struct pci_dev *dev, bool enable) -{ - return pci_platform_pm ? - pci_platform_pm->sleep_wake(dev, enable) : -ENODEV; -} - -/** - * pci_raw_set_power_state - Use PCI PM registers to set the power state of - * given PCI device - * @dev: PCI device to handle. - * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. - * @wait: If 'true', wait for the device to change its power state - * - * RETURN VALUE: - * -EINVAL if the requested state is invalid. - * -EIO if device does not support PCI PM or its PM capabilities register has a - * wrong version, or device doesn't support the requested state. - * 0 if device already is in the requested state. - * 0 if device's power state has been successfully changed. 
- */ -static int -pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state, bool wait) -{ - u16 pmcsr; - bool need_restore = false; - - if (!dev->pm_cap) - return -EIO; - - if (state < PCI_D0 || state > PCI_D3hot) - return -EINVAL; - - /* Validate current state: - * Can enter D0 from any state, but if we can only go deeper - * to sleep if we're already in a low power state - */ - if (dev->current_state == state) { - /* we're already there */ - return 0; - } else if (state != PCI_D0 && dev->current_state <= PCI_D3cold - && dev->current_state > state) { - dev_err(&dev->dev, "invalid power transition " - "(from state %d to %d)\n", dev->current_state, state); - return -EINVAL; - } - - /* check if this device supports the desired state */ - if ((state == PCI_D1 && !dev->d1_support) - || (state == PCI_D2 && !dev->d2_support)) - return -EIO; - - pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); - - /* If we're (effectively) in D3, force entire word to 0. - * This doesn't affect PME_Status, disables PME_En, and - * sets PowerState to 0. - */ - switch (dev->current_state) { - case PCI_D0: - case PCI_D1: - case PCI_D2: - pmcsr &= ~PCI_PM_CTRL_STATE_MASK; - pmcsr |= state; - break; - case PCI_UNKNOWN: /* Boot-up */ - if ((pmcsr & PCI_PM_CTRL_STATE_MASK) == PCI_D3hot - && !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET)) { - need_restore = true; - wait = true; - } - /* Fall-through: force to D0 */ - default: - pmcsr = 0; - break; - } - - /* enter specified state */ - pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); - - if (!wait) - return 0; - - /* Mandatory power management transition delays */ - /* see PCI PM 1.1 5.6.1 table 18 */ - if (state == PCI_D3hot || dev->current_state == PCI_D3hot) - msleep(pci_pm_d3_delay); - else if (state == PCI_D2 || dev->current_state == PCI_D2) - udelay(PCI_PM_D2_DELAY); - - dev->current_state = state; - - /* According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT - * INTERFACE SPECIFICATION, REV. 1.2", a device transitioning - * from D3hot to D0 _may_ perform an internal reset, thereby - * going to "D0 Uninitialized" rather than "D0 Initialized". - * For example, at least some versions of the 3c905B and the - * 3c556B exhibit this behaviour. - * - * At least some laptop BIOSen (e.g. the Thinkpad T21) leave - * devices in a D3hot state at boot. Consequently, we need to - * restore at least the BARs so that the device will be - * accessible to its driver. - */ - if (need_restore) - pci_restore_bars(dev); - - if (wait && dev->bus->self) - pcie_aspm_pm_state_change(dev->bus->self); - - return 0; -} - -/** - * pci_update_current_state - Read PCI power state of given device from its - * PCI PM registers and cache it - * @dev: PCI device to handle. - * @state: State to cache in case the device doesn't have the PM capability - */ -void pci_update_current_state(struct pci_dev *dev, pci_power_t state) -{ - if (dev->pm_cap) { - u16 pmcsr; - - pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); - dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); - } else { - dev->current_state = state; - } -} - -/** - * pci_set_power_state - Set the power state of a PCI device - * @dev: PCI device to handle. - * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. - * - * Transition a device to a new power state, using the platform formware and/or - * the device's PCI PM registers. - * - * RETURN VALUE: - * -EINVAL if the requested state is invalid. 
- * -EIO if device does not support PCI PM or its PM capabilities register has a - * wrong version, or device doesn't support the requested state. - * 0 if device already is in the requested state. - * 0 if device's power state has been successfully changed. - */ -int pci_set_power_state(struct pci_dev *dev, pci_power_t state) -{ - int error; - - /* bound the state we're entering */ - if (state > PCI_D3hot) - state = PCI_D3hot; - else if (state < PCI_D0) - state = PCI_D0; - else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev)) - /* - * If the device or the parent bridge do not support PCI PM, - * ignore the request if we're doing anything other than putting - * it into D0 (which would only happen on boot). - */ - return 0; - - if (state == PCI_D0 && platform_pci_power_manageable(dev)) { - /* - * Allow the platform to change the state, for example via ACPI - * _PR0, _PS0 and some such, but do not trust it. - */ - int ret = platform_pci_set_power_state(dev, PCI_D0); - if (!ret) - pci_update_current_state(dev, PCI_D0); - } - /* This device is quirked not to be put into D3, so - don't put it in D3 */ - if (state == PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3)) - return 0; - - error = pci_raw_set_power_state(dev, state, true); - - if (state > PCI_D0 && platform_pci_power_manageable(dev)) { - /* Allow the platform to finalize the transition */ - int ret = platform_pci_set_power_state(dev, state); - if (!ret) { - pci_update_current_state(dev, state); - error = 0; - } - } - - return error; -} - -/** - * pci_choose_state - Choose the power state of a PCI device - * @dev: PCI device to be suspended - * @state: target sleep state for the whole system. This is the value - * that is passed to suspend() function. - * - * Returns PCI power state suitable for given device and given system - * message. 
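/*
 * A minimal sketch, assuming a hypothetical driver "foo" using the legacy
 * suspend/resume entry points: pci_choose_state() picks the target state
 * and pci_set_power_state() performs the transition, paired with the
 * save/restore helpers that follow.
 */
#include <linux/pci.h>
#include <linux/pm.h>

static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
{
	pci_save_state(pdev);
	pci_disable_device(pdev);
	return pci_set_power_state(pdev, pci_choose_state(pdev, state));
}

static int foo_resume(struct pci_dev *pdev)
{
	pci_set_power_state(pdev, PCI_D0);
	pci_restore_state(pdev);
	return pci_enable_device(pdev);
}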
- */ - -pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) -{ - pci_power_t ret; - - if (!pci_find_capability(dev, PCI_CAP_ID_PM)) - return PCI_D0; - - ret = platform_pci_choose_state(dev); - if (ret != PCI_POWER_ERROR) - return ret; - - switch (state.event) { - case PM_EVENT_ON: - return PCI_D0; - case PM_EVENT_FREEZE: - case PM_EVENT_PRETHAW: - /* REVISIT both freeze and pre-thaw "should" use D0 */ - case PM_EVENT_SUSPEND: - case PM_EVENT_HIBERNATE: - return PCI_D3hot; - default: - dev_info(&dev->dev, "unrecognized suspend event %d\n", - state.event); - BUG(); - } - return PCI_D0; -} - -EXPORT_SYMBOL(pci_choose_state); - -static int pci_save_pcie_state(struct pci_dev *dev) -{ - int pos, i = 0; - struct pci_cap_saved_state *save_state; - u16 *cap; - - pos = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (pos <= 0) - return 0; - - save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); - if (!save_state) { - dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__); - return -ENOMEM; - } - cap = (u16 *)&save_state->data[0]; - - pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_SLTCTL, &cap[i++]); - pci_read_config_word(dev, pos + PCI_EXP_RTCTL, &cap[i++]); - - return 0; -} - -static void pci_restore_pcie_state(struct pci_dev *dev) -{ - int i = 0, pos; - struct pci_cap_saved_state *save_state; - u16 *cap; - - save_state = pci_find_saved_cap(dev, PCI_CAP_ID_EXP); - pos = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (!save_state || pos <= 0) - return; - cap = (u16 *)&save_state->data[0]; - - pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_SLTCTL, cap[i++]); - pci_write_config_word(dev, pos + PCI_EXP_RTCTL, cap[i++]); -} - - -static int pci_save_pcix_state(struct pci_dev *dev) -{ - int pos; - struct pci_cap_saved_state *save_state; - - pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (pos <= 0) - return 0; - - save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); - if (!save_state) { - dev_err(&dev->dev, "buffer not found in %s\n", __FUNCTION__); - return -ENOMEM; - } - - pci_read_config_word(dev, pos + PCI_X_CMD, (u16 *)save_state->data); - - return 0; -} - -static void pci_restore_pcix_state(struct pci_dev *dev) -{ - int i = 0, pos; - struct pci_cap_saved_state *save_state; - u16 *cap; - - save_state = pci_find_saved_cap(dev, PCI_CAP_ID_PCIX); - pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!save_state || pos <= 0) - return; - cap = (u16 *)&save_state->data[0]; - - pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]); -} - - -/** - * pci_save_state - save the PCI configuration space of a device before suspending - * @dev: - PCI device that we're dealing with - */ -int -pci_save_state(struct pci_dev *dev) -{ - int i; - /* XXX: 100% dword access ok here? 
*/ - for (i = 0; i < 16; i++) - pci_read_config_dword(dev, i * 4,&dev->saved_config_space[i]); - dev->state_saved = true; - if ((i = pci_save_pcie_state(dev)) != 0) - return i; - if ((i = pci_save_pcix_state(dev)) != 0) - return i; - return 0; -} - -/** - * pci_restore_state - Restore the saved state of a PCI device - * @dev: - PCI device that we're dealing with - */ -int -pci_restore_state(struct pci_dev *dev) -{ - int i; - u32 val; - - /* PCI Express register must be restored first */ - pci_restore_pcie_state(dev); - - /* - * The Base Address register should be programmed before the command - * register(s) - */ - for (i = 15; i >= 0; i--) { - pci_read_config_dword(dev, i * 4, &val); - if (val != dev->saved_config_space[i]) { - dev_printk(KERN_DEBUG, &dev->dev, "restoring config " - "space at offset %#x (was %#x, writing %#x)\n", - i, val, (int)dev->saved_config_space[i]); - pci_write_config_dword(dev,i * 4, - dev->saved_config_space[i]); - } - } - pci_restore_pcix_state(dev); - pci_restore_msi_state(dev); - - return 0; -} - -static int do_pci_enable_device(struct pci_dev *dev, int bars) -{ - int err; - - err = pci_set_power_state(dev, PCI_D0); - if (err < 0 && err != -EIO) - return err; - err = pcibios_enable_device(dev, bars); - if (err < 0) - return err; - pci_fixup_device(pci_fixup_enable, dev); - - return 0; -} - -/** - * pci_reenable_device - Resume abandoned device - * @dev: PCI device to be resumed - * - * Note this function is a backend of pci_default_resume and is not supposed - * to be called by normal code, write proper resume handler and use it instead. - */ -int pci_reenable_device(struct pci_dev *dev) -{ - if (atomic_read(&dev->enable_cnt)) - return do_pci_enable_device(dev, (1 << PCI_NUM_RESOURCES) - 1); - return 0; -} - -static int __pci_enable_device_flags(struct pci_dev *dev, - resource_size_t flags) -{ - int err; - int i, bars = 0; - - if (atomic_add_return(1, &dev->enable_cnt) > 1) - return 0; /* already enabled */ - - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) - if (dev->resource[i].flags & flags) - bars |= (1 << i); - - err = do_pci_enable_device(dev, bars); - if (err < 0) - atomic_dec(&dev->enable_cnt); - return err; -} - -/** - * pci_enable_device_io - Initialize a device for use with IO space - * @dev: PCI device to be initialized - * - * Initialize device before it's used by a driver. Ask low-level code - * to enable I/O resources. Wake up the device if it was suspended. - * Beware, this function can fail. - */ -int pci_enable_device_io(struct pci_dev *dev) -{ - return __pci_enable_device_flags(dev, IORESOURCE_IO); -} - -/** - * pci_enable_device_mem - Initialize a device for use with Memory space - * @dev: PCI device to be initialized - * - * Initialize device before it's used by a driver. Ask low-level code - * to enable Memory resources. Wake up the device if it was suspended. - * Beware, this function can fail. - */ -int pci_enable_device_mem(struct pci_dev *dev) -{ - return __pci_enable_device_flags(dev, IORESOURCE_MEM); -} - -/** pci_enable_device() is implemented by the DDE. */ -#ifndef DDE_LINUX -/** - * pci_enable_device - Initialize device before it's used by a driver. - * @dev: PCI device to be initialized - * - * Initialize device before it's used by a driver. Ask low-level code - * to enable I/O and memory. Wake up the device if it was suspended. - * Beware, this function can fail. - * - * Note we don't actually enable the device many times if we call - * this function repeatedly (we just increment the count). 
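/*
 * A minimal sketch, assuming a hypothetical MMIO-only device "foo":
 * pci_enable_device_mem() above enables just the memory BARs, leaving any
 * I/O-port resources disabled.
 */
#include <linux/pci.h>

static int foo_probe_mem_only(struct pci_dev *pdev,
			      const struct pci_device_id *id)
{
	int err;

	err = pci_enable_device_mem(pdev);	/* IORESOURCE_MEM BARs only */
	if (err)
		return err;

	/* ... map BARs, request the IRQ, and so on ... */
	return 0;
}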
- */ -int pci_enable_device(struct pci_dev *dev) -{ - return __pci_enable_device_flags(dev, IORESOURCE_MEM | IORESOURCE_IO); -} -#endif - -/* - * Managed PCI resources. This manages device on/off, intx/msi/msix - * on/off and BAR regions. pci_dev itself records msi/msix status, so - * there's no need to track it separately. pci_devres is initialized - * when a device is enabled using managed PCI device enable interface. - */ -struct pci_devres { - unsigned int enabled:1; - unsigned int pinned:1; - unsigned int orig_intx:1; - unsigned int restore_intx:1; - u32 region_mask; -}; - -static void pcim_release(struct device *gendev, void *res) -{ - struct pci_dev *dev = container_of(gendev, struct pci_dev, dev); - struct pci_devres *this = res; - int i; - - if (dev->msi_enabled) - pci_disable_msi(dev); - if (dev->msix_enabled) - pci_disable_msix(dev); - - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) - if (this->region_mask & (1 << i)) - pci_release_region(dev, i); - - if (this->restore_intx) - pci_intx(dev, this->orig_intx); - - if (this->enabled && !this->pinned) - pci_disable_device(dev); -} - -static struct pci_devres * get_pci_dr(struct pci_dev *pdev) -{ - struct pci_devres *dr, *new_dr; - - dr = devres_find(&pdev->dev, pcim_release, NULL, NULL); - if (dr) - return dr; - - new_dr = devres_alloc(pcim_release, sizeof(*new_dr), GFP_KERNEL); - if (!new_dr) - return NULL; - return devres_get(&pdev->dev, new_dr, NULL, NULL); -} - -static struct pci_devres * find_pci_dr(struct pci_dev *pdev) -{ - if (pci_is_managed(pdev)) - return devres_find(&pdev->dev, pcim_release, NULL, NULL); - return NULL; -} - -/** - * pcim_enable_device - Managed pci_enable_device() - * @pdev: PCI device to be initialized - * - * Managed pci_enable_device(). - */ -int pcim_enable_device(struct pci_dev *pdev) -{ - struct pci_devres *dr; - int rc; - - dr = get_pci_dr(pdev); - if (unlikely(!dr)) - return -ENOMEM; - if (dr->enabled) - return 0; - - rc = pci_enable_device(pdev); - if (!rc) { - pdev->is_managed = 1; - dr->enabled = 1; - } - return rc; -} - -/** - * pcim_pin_device - Pin managed PCI device - * @pdev: PCI device to pin - * - * Pin managed PCI device @pdev. Pinned device won't be disabled on - * driver detach. @pdev must have been enabled with - * pcim_enable_device(). - */ -void pcim_pin_device(struct pci_dev *pdev) -{ - struct pci_devres *dr; - - dr = find_pci_dr(pdev); - WARN_ON(!dr || !dr->enabled); - if (dr) - dr->pinned = 1; -} - -#ifndef DDE_LINUX -/** - * pcibios_disable_device - disable arch specific PCI resources for device dev - * @dev: the PCI device to disable - * - * Disables architecture specific PCI resources for the device. This - * is the default implementation. Architecture implementations can - * override this. - */ -void __attribute__ ((weak)) pcibios_disable_device (struct pci_dev *dev) {} - -static void do_pci_disable_device(struct pci_dev *dev) -{ - u16 pci_command; - - pci_read_config_word(dev, PCI_COMMAND, &pci_command); - if (pci_command & PCI_COMMAND_MASTER) { - pci_command &= ~PCI_COMMAND_MASTER; - pci_write_config_word(dev, PCI_COMMAND, pci_command); - } - - pcibios_disable_device(dev); -} - -/** - * pci_disable_enabled_device - Disable device without updating enable_cnt - * @dev: PCI device to disable - * - * NOTE: This function is a backend of PCI power management routines and is - * not supposed to be called drivers. 
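/*
 * A minimal sketch, assuming a hypothetical driver "foo": with the managed
 * interface above, pcim_enable_device() arranges for the device to be
 * disabled automatically on driver detach, so error paths and the remove
 * callback need no explicit pci_disable_device().
 */
#include <linux/pci.h>

static int foo_probe_managed(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
	int err;

	err = pcim_enable_device(pdev);
	if (err)
		return err;

	/* Clean-up is handled by pcim_release() above. */
	return 0;
}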
- */ -void pci_disable_enabled_device(struct pci_dev *dev) -{ - if (atomic_read(&dev->enable_cnt)) - do_pci_disable_device(dev); -} - -/** - * pci_disable_device - Disable PCI device after use - * @dev: PCI device to be disabled - * - * Signal to the system that the PCI device is not in use by the system - * anymore. This only involves disabling PCI bus-mastering, if active. - * - * Note we don't actually disable the device until all callers of - * pci_device_enable() have called pci_device_disable(). - */ -void -pci_disable_device(struct pci_dev *dev) -{ - struct pci_devres *dr; - - dr = find_pci_dr(dev); - if (dr) - dr->enabled = 0; - - if (atomic_sub_return(1, &dev->enable_cnt) != 0) - return; - - do_pci_disable_device(dev); - - dev->is_busmaster = 0; -} - -/** - * pcibios_set_pcie_reset_state - set reset state for device dev - * @dev: the PCI-E device reset - * @state: Reset state to enter into - * - * - * Sets the PCI-E reset state for the device. This is the default - * implementation. Architecture implementations can override this. - */ -int __attribute__ ((weak)) pcibios_set_pcie_reset_state(struct pci_dev *dev, - enum pcie_reset_state state) -{ - return -EINVAL; -} -#endif - -/** - * pci_set_pcie_reset_state - set reset state for device dev - * @dev: the PCI-E device reset - * @state: Reset state to enter into - * - * - * Sets the PCI reset state for the device. - */ -int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) -{ - return pcibios_set_pcie_reset_state(dev, state); -} - -/** - * pci_pme_capable - check the capability of PCI device to generate PME# - * @dev: PCI device to handle. - * @state: PCI state from which device will issue PME#. - */ -bool pci_pme_capable(struct pci_dev *dev, pci_power_t state) -{ - if (!dev->pm_cap) - return false; - - return !!(dev->pme_support & (1 << state)); -} - -/** - * pci_pme_active - enable or disable PCI device's PME# function - * @dev: PCI device to handle. - * @enable: 'true' to enable PME# generation; 'false' to disable it. - * - * The caller must verify that the device is capable of generating PME# before - * calling this function with @enable equal to 'true'. - */ -void pci_pme_active(struct pci_dev *dev, bool enable) -{ - u16 pmcsr; - - if (!dev->pm_cap) - return; - - pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); - /* Clear PME_Status by writing 1 to it and enable PME# */ - pmcsr |= PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE; - if (!enable) - pmcsr &= ~PCI_PM_CTRL_PME_ENABLE; - - pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr); - - dev_printk(KERN_INFO, &dev->dev, "PME# %s\n", - enable ? "enabled" : "disabled"); -} - -/** - * pci_enable_wake - enable PCI device as wakeup event source - * @dev: PCI device affected - * @state: PCI state from which device will issue wakeup events - * @enable: True to enable event generation; false to disable - * - * This enables the device as a wakeup event source, or disables it. - * When such events involves platform-specific hooks, those hooks are - * called automatically by this routine. - * - * Devices with legacy power management (no standard PCI PM capabilities) - * always require such platform hooks. 
- * - * RETURN VALUE: - * 0 is returned on success - * -EINVAL is returned if device is not supposed to wake up the system - * Error code depending on the platform is returned if both the platform and - * the native mechanism fail to enable the generation of wake-up events - */ -int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) -{ - int error = 0; - bool pme_done = false; - - if (enable && !device_may_wakeup(&dev->dev)) - return -EINVAL; - - /* - * According to "PCI System Architecture" 4th ed. by Tom Shanley & Don - * Anderson we should be doing PME# wake enable followed by ACPI wake - * enable. To disable wake-up we call the platform first, for symmetry. - */ - - if (!enable && platform_pci_can_wakeup(dev)) - error = platform_pci_sleep_wake(dev, false); - - if (!enable || pci_pme_capable(dev, state)) { - pci_pme_active(dev, enable); - pme_done = true; - } - - if (enable && platform_pci_can_wakeup(dev)) - error = platform_pci_sleep_wake(dev, true); - - return pme_done ? 0 : error; -} - -/** - * pci_wake_from_d3 - enable/disable device to wake up from D3_hot or D3_cold - * @dev: PCI device to prepare - * @enable: True to enable wake-up event generation; false to disable - * - * Many drivers want the device to wake up the system from D3_hot or D3_cold - * and this function allows them to set that up cleanly - pci_enable_wake() - * should not be called twice in a row to enable wake-up due to PCI PM vs ACPI - * ordering constraints. - * - * This function only returns error code if the device is not capable of - * generating PME# from both D3_hot and D3_cold, and the platform is unable to - * enable wake-up power for it. - */ -int pci_wake_from_d3(struct pci_dev *dev, bool enable) -{ - return pci_pme_capable(dev, PCI_D3cold) ? - pci_enable_wake(dev, PCI_D3cold, enable) : - pci_enable_wake(dev, PCI_D3hot, enable); -} - -/** - * pci_target_state - find an appropriate low power state for a given PCI dev - * @dev: PCI device - * - * Use underlying platform code to find a supported low power state for @dev. - * If the platform can't manage @dev, return the deepest state from which it - * can generate wake events, based on any available PME info. - */ -pci_power_t pci_target_state(struct pci_dev *dev) -{ - pci_power_t target_state = PCI_D3hot; - - if (platform_pci_power_manageable(dev)) { - /* - * Call the platform to choose the target state of the device - * and enable wake-up from this state if supported. - */ - pci_power_t state = platform_pci_choose_state(dev); - - switch (state) { - case PCI_POWER_ERROR: - case PCI_UNKNOWN: - break; - case PCI_D1: - case PCI_D2: - if (pci_no_d1d2(dev)) - break; - default: - target_state = state; - } - } else if (device_may_wakeup(&dev->dev)) { - /* - * Find the deepest state from which the device can generate - * wake-up events, make it the target state and enable device - * to generate PME#. - */ - if (!dev->pm_cap) - return PCI_POWER_ERROR; - - if (dev->pme_support) { - while (target_state - && !(dev->pme_support & (1 << target_state))) - target_state--; - } - } - - return target_state; -} - -/** - * pci_prepare_to_sleep - prepare PCI device for system-wide transition into a sleep state - * @dev: Device to handle. - * - * Choose the power state appropriate for the device depending on whether - * it can wake up the system and/or is power manageable by the platform - * (PCI_D3hot is the default) and put the device into that state. 
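/*
 * A minimal sketch, assuming a hypothetical wake_on_lan option: arm wake-up
 * from a suspend path with pci_wake_from_d3() above, which picks D3cold
 * when the device can signal PME# from it and falls back to D3hot.
 */
#include <linux/pci.h>
#include <linux/pm.h>

static int foo_suspend_with_wake(struct pci_dev *pdev, pm_message_t state,
				 bool wake_on_lan)
{
	pci_save_state(pdev);
	pci_wake_from_d3(pdev, wake_on_lan);
	return pci_set_power_state(pdev, pci_choose_state(pdev, state));
}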
- */ -int pci_prepare_to_sleep(struct pci_dev *dev) -{ - pci_power_t target_state = pci_target_state(dev); - int error; - - if (target_state == PCI_POWER_ERROR) - return -EIO; - - pci_enable_wake(dev, target_state, true); - - error = pci_set_power_state(dev, target_state); - - if (error) - pci_enable_wake(dev, target_state, false); - - return error; -} - -/** - * pci_back_from_sleep - turn PCI device on during system-wide transition into working state - * @dev: Device to handle. - * - * Disable device's sytem wake-up capability and put it into D0. - */ -int pci_back_from_sleep(struct pci_dev *dev) -{ - pci_enable_wake(dev, PCI_D0, false); - return pci_set_power_state(dev, PCI_D0); -} - -/** - * pci_pm_init - Initialize PM functions of given PCI device - * @dev: PCI device to handle. - */ -void pci_pm_init(struct pci_dev *dev) -{ - int pm; - u16 pmc; - - dev->pm_cap = 0; - - /* find PCI PM capability in list */ - pm = pci_find_capability(dev, PCI_CAP_ID_PM); - if (!pm) - return; - /* Check device's ability to generate PME# */ - pci_read_config_word(dev, pm + PCI_PM_PMC, &pmc); - - if ((pmc & PCI_PM_CAP_VER_MASK) > 3) { - dev_err(&dev->dev, "unsupported PM cap regs version (%u)\n", - pmc & PCI_PM_CAP_VER_MASK); - return; - } - - dev->pm_cap = pm; - - dev->d1_support = false; - dev->d2_support = false; - if (!pci_no_d1d2(dev)) { - if (pmc & PCI_PM_CAP_D1) - dev->d1_support = true; - if (pmc & PCI_PM_CAP_D2) - dev->d2_support = true; - - if (dev->d1_support || dev->d2_support) - dev_printk(KERN_DEBUG, &dev->dev, "supports%s%s\n", - dev->d1_support ? " D1" : "", - dev->d2_support ? " D2" : ""); - } - - pmc &= PCI_PM_CAP_PME_MASK; - if (pmc) { - dev_info(&dev->dev, "PME# supported from%s%s%s%s%s\n", - (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", - (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", - (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", - (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", - (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); - dev->pme_support = pmc >> PCI_PM_CAP_PME_SHIFT; - /* - * Make device's PM flags reflect the wake-up capability, but - * let the user space enable it to wake up the system as needed. - */ - device_set_wakeup_capable(&dev->dev, true); - device_set_wakeup_enable(&dev->dev, false); - /* Disable the PME# generation functionality */ - pci_pme_active(dev, false); - } else { - dev->pme_support = 0; - } -} - -/** - * platform_pci_wakeup_init - init platform wakeup if present - * @dev: PCI device - * - * Some devices don't have PCI PM caps but can still generate wakeup - * events through platform methods (like ACPI events). If @dev supports - * platform wakeup events, set the device flag to indicate as much. This - * may be redundant if the device also supports PCI PM caps, but double - * initialization should be safe in that case. 
- */ -void platform_pci_wakeup_init(struct pci_dev *dev) -{ - if (!platform_pci_can_wakeup(dev)) - return; - - device_set_wakeup_capable(&dev->dev, true); - device_set_wakeup_enable(&dev->dev, false); - platform_pci_sleep_wake(dev, false); -} - - -/** - * pci_add_save_buffer - allocate buffer for saving given capability registers - * @dev: the PCI device - * @cap: the capability to allocate the buffer for - * @size: requested size of the buffer - */ -static int pci_add_cap_save_buffer( - struct pci_dev *dev, char cap, unsigned int size) -{ - int pos; - struct pci_cap_saved_state *save_state; - - pos = pci_find_capability(dev, cap); - if (pos <= 0) - return 0; - - save_state = kzalloc(sizeof(*save_state) + size, GFP_KERNEL); - if (!save_state) - return -ENOMEM; - - save_state->cap_nr = cap; - pci_add_saved_cap(dev, save_state); - - return 0; -} - -/** - * pci_allocate_cap_save_buffers - allocate buffers for saving capabilities - * @dev: the PCI device - */ -void pci_allocate_cap_save_buffers(struct pci_dev *dev) -{ - int error; - - error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_EXP, 4 * sizeof(u16)); - if (error) - dev_err(&dev->dev, - "unable to preallocate PCI Express save buffer\n"); - - error = pci_add_cap_save_buffer(dev, PCI_CAP_ID_PCIX, sizeof(u16)); - if (error) - dev_err(&dev->dev, - "unable to preallocate PCI-X save buffer\n"); -} - -/** - * pci_restore_standard_config - restore standard config registers of PCI device - * @dev: PCI device to handle - * - * This function assumes that the device's configuration space is accessible. - * If the device needs to be powered up, the function will wait for it to - * change the state. - */ -int pci_restore_standard_config(struct pci_dev *dev) -{ - pci_power_t prev_state; - int error; - - pci_update_current_state(dev, PCI_D0); - - prev_state = dev->current_state; - if (prev_state == PCI_D0) - goto Restore; - - error = pci_raw_set_power_state(dev, PCI_D0, false); - if (error) - return error; - - /* - * This assumes that we won't get a bus in B2 or B3 from the BIOS, but - * we've made this assumption forever and it appears to be universally - * satisfied. - */ - switch(prev_state) { - case PCI_D3cold: - case PCI_D3hot: - mdelay(pci_pm_d3_delay); - break; - case PCI_D2: - udelay(PCI_PM_D2_DELAY); - break; - } - - pci_update_current_state(dev, PCI_D0); - - Restore: - return dev->state_saved ? pci_restore_state(dev) : 0; -} - -/** - * pci_enable_ari - enable ARI forwarding if hardware support it - * @dev: the PCI device - */ -void pci_enable_ari(struct pci_dev *dev) -{ - int pos; - u32 cap; - u16 ctrl; - struct pci_dev *bridge; - - if (!dev->is_pcie || dev->devfn) - return; - - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI); - if (!pos) - return; - - bridge = dev->bus->self; - if (!bridge || !bridge->is_pcie) - return; - - pos = pci_find_capability(bridge, PCI_CAP_ID_EXP); - if (!pos) - return; - - pci_read_config_dword(bridge, pos + PCI_EXP_DEVCAP2, &cap); - if (!(cap & PCI_EXP_DEVCAP2_ARI)) - return; - - pci_read_config_word(bridge, pos + PCI_EXP_DEVCTL2, &ctrl); - ctrl |= PCI_EXP_DEVCTL2_ARI; - pci_write_config_word(bridge, pos + PCI_EXP_DEVCTL2, ctrl); - - bridge->ari_enabled = 1; -} - -/** - * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge - * @dev: the PCI device - * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTD, 4=INTD) - * - * Perform INTx swizzling for a device behind one level of bridge. This is - * required by section 9.1 of the PCI-to-PCI bridge specification for devices - * behind bridges on add-in cards. 
- */ -u8 pci_swizzle_interrupt_pin(struct pci_dev *dev, u8 pin) -{ - return (((pin - 1) + PCI_SLOT(dev->devfn)) % 4) + 1; -} - -int -pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge) -{ - u8 pin; - - pin = dev->pin; - if (!pin) - return -1; - - while (dev->bus->self) { - pin = pci_swizzle_interrupt_pin(dev, pin); - dev = dev->bus->self; - } - *bridge = dev; - return pin; -} - -/** - * pci_common_swizzle - swizzle INTx all the way to root bridge - * @dev: the PCI device - * @pinp: pointer to the INTx pin value (1=INTA, 2=INTB, 3=INTD, 4=INTD) - * - * Perform INTx swizzling for a device. This traverses through all PCI-to-PCI - * bridges all the way up to a PCI root bus. - */ -u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp) -{ - u8 pin = *pinp; - - while (dev->bus->self) { - pin = pci_swizzle_interrupt_pin(dev, pin); - dev = dev->bus->self; - } - *pinp = pin; - return PCI_SLOT(dev->devfn); -} - -/** - * pci_release_region - Release a PCI bar - * @pdev: PCI device whose resources were previously reserved by pci_request_region - * @bar: BAR to release - * - * Releases the PCI I/O and memory resources previously reserved by a - * successful call to pci_request_region. Call this function only - * after all use of the PCI regions has ceased. - */ -void pci_release_region(struct pci_dev *pdev, int bar) -{ - struct pci_devres *dr; - - if (pci_resource_len(pdev, bar) == 0) - return; - if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) - release_region(pci_resource_start(pdev, bar), - pci_resource_len(pdev, bar)); - else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) - release_mem_region(pci_resource_start(pdev, bar), - pci_resource_len(pdev, bar)); - - dr = find_pci_dr(pdev); - if (dr) - dr->region_mask &= ~(1 << bar); -} - -/** - * __pci_request_region - Reserved PCI I/O and memory resource - * @pdev: PCI device whose resources are to be reserved - * @bar: BAR to be reserved - * @res_name: Name to be associated with resource. - * @exclusive: whether the region access is exclusive or not - * - * Mark the PCI region associated with PCI device @pdev BR @bar as - * being reserved by owner @res_name. Do not access any - * address inside the PCI regions unless this call returns - * successfully. - * - * If @exclusive is set, then the region is marked so that userspace - * is explicitly not allowed to map the resource via /dev/mem or - * sysfs MMIO access. - * - * Returns 0 on success, or %EBUSY on error. A warning - * message is also printed on failure. - */ -static int __pci_request_region(struct pci_dev *pdev, int bar, const char *res_name, - int exclusive) -{ - struct pci_devres *dr; - - if (pci_resource_len(pdev, bar) == 0) - return 0; - - if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) { - if (!request_region(pci_resource_start(pdev, bar), - pci_resource_len(pdev, bar), res_name)) - goto err_out; - } - else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { - if (!__request_mem_region(pci_resource_start(pdev, bar), - pci_resource_len(pdev, bar), res_name, - exclusive)) - goto err_out; - } - - dr = find_pci_dr(pdev); - if (dr) - dr->region_mask |= 1 << bar; - - return 0; - -err_out: - dev_warn(&pdev->dev, "BAR %d: can't reserve %s region %pR\n", - bar, - pci_resource_flags(pdev, bar) & IORESOURCE_IO ? 
"I/O" : "mem", - &pdev->resource[bar]); - return -EBUSY; -} - -/** - * pci_request_region - Reserve PCI I/O and memory resource - * @pdev: PCI device whose resources are to be reserved - * @bar: BAR to be reserved - * @res_name: Name to be associated with resource - * - * Mark the PCI region associated with PCI device @pdev BAR @bar as - * being reserved by owner @res_name. Do not access any - * address inside the PCI regions unless this call returns - * successfully. - * - * Returns 0 on success, or %EBUSY on error. A warning - * message is also printed on failure. - */ -int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name) -{ - return __pci_request_region(pdev, bar, res_name, 0); -} - -/** - * pci_request_region_exclusive - Reserved PCI I/O and memory resource - * @pdev: PCI device whose resources are to be reserved - * @bar: BAR to be reserved - * @res_name: Name to be associated with resource. - * - * Mark the PCI region associated with PCI device @pdev BR @bar as - * being reserved by owner @res_name. Do not access any - * address inside the PCI regions unless this call returns - * successfully. - * - * Returns 0 on success, or %EBUSY on error. A warning - * message is also printed on failure. - * - * The key difference that _exclusive makes it that userspace is - * explicitly not allowed to map the resource via /dev/mem or - * sysfs. - */ -int pci_request_region_exclusive(struct pci_dev *pdev, int bar, const char *res_name) -{ - return __pci_request_region(pdev, bar, res_name, IORESOURCE_EXCLUSIVE); -} -/** - * pci_release_selected_regions - Release selected PCI I/O and memory resources - * @pdev: PCI device whose resources were previously reserved - * @bars: Bitmask of BARs to be released - * - * Release selected PCI I/O and memory resources previously reserved. - * Call this function only after all use of the PCI regions has ceased. - */ -void pci_release_selected_regions(struct pci_dev *pdev, int bars) -{ - int i; - - for (i = 0; i < 6; i++) - if (bars & (1 << i)) - pci_release_region(pdev, i); -} - -int __pci_request_selected_regions(struct pci_dev *pdev, int bars, - const char *res_name, int excl) -{ - int i; - - for (i = 0; i < 6; i++) - if (bars & (1 << i)) - if (__pci_request_region(pdev, i, res_name, excl)) - goto err_out; - return 0; - -err_out: - while(--i >= 0) - if (bars & (1 << i)) - pci_release_region(pdev, i); - - return -EBUSY; -} - - -/** - * pci_request_selected_regions - Reserve selected PCI I/O and memory resources - * @pdev: PCI device whose resources are to be reserved - * @bars: Bitmask of BARs to be requested - * @res_name: Name to be associated with resource - */ -int pci_request_selected_regions(struct pci_dev *pdev, int bars, - const char *res_name) -{ - return __pci_request_selected_regions(pdev, bars, res_name, 0); -} - -int pci_request_selected_regions_exclusive(struct pci_dev *pdev, - int bars, const char *res_name) -{ - return __pci_request_selected_regions(pdev, bars, res_name, - IORESOURCE_EXCLUSIVE); -} - -/** - * pci_release_regions - Release reserved PCI I/O and memory resources - * @pdev: PCI device whose resources were previously reserved by pci_request_regions - * - * Releases all PCI I/O and memory resources previously reserved by a - * successful call to pci_request_regions. Call this function only - * after all use of the PCI regions has ceased. 
- */ - -void pci_release_regions(struct pci_dev *pdev) -{ - pci_release_selected_regions(pdev, (1 << 6) - 1); -} - -/** - * pci_request_regions - Reserved PCI I/O and memory resources - * @pdev: PCI device whose resources are to be reserved - * @res_name: Name to be associated with resource. - * - * Mark all PCI regions associated with PCI device @pdev as - * being reserved by owner @res_name. Do not access any - * address inside the PCI regions unless this call returns - * successfully. - * - * Returns 0 on success, or %EBUSY on error. A warning - * message is also printed on failure. - */ -int pci_request_regions(struct pci_dev *pdev, const char *res_name) -{ - return pci_request_selected_regions(pdev, ((1 << 6) - 1), res_name); -} - -#ifndef DDE_LINUX -/** - * pci_request_regions_exclusive - Reserved PCI I/O and memory resources - * @pdev: PCI device whose resources are to be reserved - * @res_name: Name to be associated with resource. - * - * Mark all PCI regions associated with PCI device @pdev as - * being reserved by owner @res_name. Do not access any - * address inside the PCI regions unless this call returns - * successfully. - * - * pci_request_regions_exclusive() will mark the region so that - * /dev/mem and the sysfs MMIO access will not be allowed. - * - * Returns 0 on success, or %EBUSY on error. A warning - * message is also printed on failure. - */ -int pci_request_regions_exclusive(struct pci_dev *pdev, const char *res_name) -{ - return pci_request_selected_regions_exclusive(pdev, - ((1 << 6) - 1), res_name); -} - -static void __pci_set_master(struct pci_dev *dev, bool enable) -{ - u16 old_cmd, cmd; - - pci_read_config_word(dev, PCI_COMMAND, &old_cmd); - if (enable) - cmd = old_cmd | PCI_COMMAND_MASTER; - else - cmd = old_cmd & ~PCI_COMMAND_MASTER; - if (cmd != old_cmd) { - dev_dbg(&dev->dev, "%s bus mastering\n", - enable ? "enabling" : "disabling"); - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - dev->is_busmaster = enable; -} - -/** - * pci_set_master - enables bus-mastering for device dev - * @dev: the PCI device to enable - * - * Enables bus-mastering on the device and calls pcibios_set_master() - * to do the needed arch specific settings. - */ -void pci_set_master(struct pci_dev *dev) -{ - __pci_set_master(dev, true); - pcibios_set_master(dev); -} - -/** - * pci_clear_master - disables bus-mastering for device dev - * @dev: the PCI device to disable - */ -void pci_clear_master(struct pci_dev *dev) -{ - __pci_set_master(dev, false); -} -#endif /* DDE_LINUX */ - -#ifdef PCI_DISABLE_MWI -int pci_set_mwi(struct pci_dev *dev) -{ - return 0; -} - -int pci_try_set_mwi(struct pci_dev *dev) -{ - return 0; -} - -void pci_clear_mwi(struct pci_dev *dev) -{ -} - -#else - -#ifndef PCI_CACHE_LINE_BYTES -#define PCI_CACHE_LINE_BYTES L1_CACHE_BYTES -#endif - -/* This can be overridden by arch code. */ -/* Don't forget this is measured in 32-bit words, not bytes */ -u8 pci_cache_line_size = PCI_CACHE_LINE_BYTES / 4; - -/** - * pci_set_cacheline_size - ensure the CACHE_LINE_SIZE register is programmed - * @dev: the PCI device for which MWI is to be enabled - * - * Helper function for pci_set_mwi. - * Originally copied from drivers/net/acenic.c. - * Copyright 1998-2001 by Jes Sorensen, <jes@trained-monkey.org>. - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int -pci_set_cacheline_size(struct pci_dev *dev) -{ - u8 cacheline_size; - - if (!pci_cache_line_size) - return -EINVAL; /* The system doesn't support MWI. 
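/*
 * A minimal sketch of the usual pairing of the region helpers above in
 * probe and remove, with bus mastering enabled in between; the owner name
 * "foo" is hypothetical.
 */
#include <linux/pci.h>

static int foo_claim_resources(struct pci_dev *pdev)
{
	int err;

	err = pci_request_regions(pdev, "foo");	/* reserve every BAR */
	if (err)
		return err;	/* -EBUSY if another owner holds one already */

	pci_set_master(pdev);	/* allow the device to initiate DMA */
	return 0;
}

static void foo_release_resources(struct pci_dev *pdev)
{
	/* Only after all MMIO/port access through the BARs has stopped. */
	pci_release_regions(pdev);
}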
*/ - - /* Validate current setting: the PCI_CACHE_LINE_SIZE must be - equal to or multiple of the right value. */ - pci_read_config_byte(dev, PCI_CACHE_LINE_SIZE, &cacheline_size); - if (cacheline_size >= pci_cache_line_size && - (cacheline_size % pci_cache_line_size) == 0) - return 0; - - /* Write the correct value. */ - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, pci_cache_line_size); - /* Read it back. */ - pci_read_config_byte(dev, PCI_CACHE_LINE_SIZE, &cacheline_size); - if (cacheline_size == pci_cache_line_size) - return 0; - - dev_printk(KERN_DEBUG, &dev->dev, "cache line size of %d is not " - "supported\n", pci_cache_line_size << 2); - - return -EINVAL; -} - -/** - * pci_set_mwi - enables memory-write-invalidate PCI transaction - * @dev: the PCI device for which MWI is enabled - * - * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND. - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -int -pci_set_mwi(struct pci_dev *dev) -{ - int rc; - u16 cmd; - - rc = pci_set_cacheline_size(dev); - if (rc) - return rc; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - if (! (cmd & PCI_COMMAND_INVALIDATE)) { - dev_dbg(&dev->dev, "enabling Mem-Wr-Inval\n"); - cmd |= PCI_COMMAND_INVALIDATE; - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - - return 0; -} - -/** - * pci_try_set_mwi - enables memory-write-invalidate PCI transaction - * @dev: the PCI device for which MWI is enabled - * - * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND. - * Callers are not required to check the return value. - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -int pci_try_set_mwi(struct pci_dev *dev) -{ - int rc = pci_set_mwi(dev); - return rc; -} - -/** - * pci_clear_mwi - disables Memory-Write-Invalidate for device dev - * @dev: the PCI device to disable - * - * Disables PCI Memory-Write-Invalidate transaction on the device - */ -void -pci_clear_mwi(struct pci_dev *dev) -{ - u16 cmd; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - if (cmd & PCI_COMMAND_INVALIDATE) { - cmd &= ~PCI_COMMAND_INVALIDATE; - pci_write_config_word(dev, PCI_COMMAND, cmd); - } -} -#endif /* ! PCI_DISABLE_MWI */ - -/** - * pci_intx - enables/disables PCI INTx for device dev - * @pdev: the PCI device to operate on - * @enable: boolean: whether to enable or disable PCI INTx - * - * Enables/disables PCI INTx for device dev - */ -void -pci_intx(struct pci_dev *pdev, int enable) -{ - u16 pci_command, new; - - pci_read_config_word(pdev, PCI_COMMAND, &pci_command); - - if (enable) { - new = pci_command & ~PCI_COMMAND_INTX_DISABLE; - } else { - new = pci_command | PCI_COMMAND_INTX_DISABLE; - } - - if (new != pci_command) { - struct pci_devres *dr; - - pci_write_config_word(pdev, PCI_COMMAND, new); - - dr = find_pci_dr(pdev); - if (dr && !dr->restore_intx) { - dr->restore_intx = 1; - dr->orig_intx = !enable; - } - } -} - -/** - * pci_msi_off - disables any msi or msix capabilities - * @dev: the PCI device to operate on - * - * If you want to use msi see pci_enable_msi and friends. - * This is a lower level primitive that allows us to disable - * msi operation at the device level. 
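/*
 * A minimal sketch: pci_try_set_mwi() above is the best-effort variant, so
 * its return value may be ignored, and pci_intx() masks legacy INTx while a
 * driver relies on MSI. The "foo" naming is hypothetical.
 */
#include <linux/pci.h>

static void foo_tune_bus(struct pci_dev *pdev)
{
	/* Enable Memory-Write-Invalidate if the cache line size allows it. */
	pci_try_set_mwi(pdev);

	/* Disable INTx assertion; MSI delivery is unaffected. */
	pci_intx(pdev, 0);
}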
- */ -void pci_msi_off(struct pci_dev *dev) -{ - int pos; - u16 control; - - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (pos) { - pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); - control &= ~PCI_MSI_FLAGS_ENABLE; - pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); - } - pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos) { - pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control); - control &= ~PCI_MSIX_FLAGS_ENABLE; - pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control); - } -} - -#ifndef HAVE_ARCH_PCI_SET_DMA_MASK -/* - * These can be overridden by arch-specific implementations - */ -int -pci_set_dma_mask(struct pci_dev *dev, u64 mask) -{ - if (!pci_dma_supported(dev, mask)) - return -EIO; - - dev->dma_mask = mask; - - return 0; -} - -int -pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask) -{ - if (!pci_dma_supported(dev, mask)) - return -EIO; - - dev->dev.coherent_dma_mask = mask; - - return 0; -} -#endif - -#ifndef HAVE_ARCH_PCI_SET_DMA_MAX_SEGMENT_SIZE -int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size) -{ - return dma_set_max_seg_size(&dev->dev, size); -} -EXPORT_SYMBOL(pci_set_dma_max_seg_size); -#endif - -#ifndef HAVE_ARCH_PCI_SET_DMA_SEGMENT_BOUNDARY -int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask) -{ - return dma_set_seg_boundary(&dev->dev, mask); -} -EXPORT_SYMBOL(pci_set_dma_seg_boundary); -#endif - -static int __pcie_flr(struct pci_dev *dev, int probe) -{ - u16 status; - u32 cap; - int exppos = pci_find_capability(dev, PCI_CAP_ID_EXP); - - if (!exppos) - return -ENOTTY; - pci_read_config_dword(dev, exppos + PCI_EXP_DEVCAP, &cap); - if (!(cap & PCI_EXP_DEVCAP_FLR)) - return -ENOTTY; - - if (probe) - return 0; - - pci_block_user_cfg_access(dev); - - /* Wait for Transaction Pending bit clean */ - msleep(100); - pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); - if (status & PCI_EXP_DEVSTA_TRPND) { - dev_info(&dev->dev, "Busy after 100ms while trying to reset; " - "sleeping for 1 second\n"); - ssleep(1); - pci_read_config_word(dev, exppos + PCI_EXP_DEVSTA, &status); - if (status & PCI_EXP_DEVSTA_TRPND) - dev_info(&dev->dev, "Still busy after 1s; " - "proceeding with reset anyway\n"); - } - - pci_write_config_word(dev, exppos + PCI_EXP_DEVCTL, - PCI_EXP_DEVCTL_BCR_FLR); - mdelay(100); - - pci_unblock_user_cfg_access(dev); - return 0; -} - -static int __pci_af_flr(struct pci_dev *dev, int probe) -{ - int cappos = pci_find_capability(dev, PCI_CAP_ID_AF); - u8 status; - u8 cap; - - if (!cappos) - return -ENOTTY; - pci_read_config_byte(dev, cappos + PCI_AF_CAP, &cap); - if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) - return -ENOTTY; - - if (probe) - return 0; - - pci_block_user_cfg_access(dev); - - /* Wait for Transaction Pending bit clean */ - msleep(100); - pci_read_config_byte(dev, cappos + PCI_AF_STATUS, &status); - if (status & PCI_AF_STATUS_TP) { - dev_info(&dev->dev, "Busy after 100ms while trying to" - " reset; sleeping for 1 second\n"); - ssleep(1); - pci_read_config_byte(dev, - cappos + PCI_AF_STATUS, &status); - if (status & PCI_AF_STATUS_TP) - dev_info(&dev->dev, "Still busy after 1s; " - "proceeding with reset anyway\n"); - } - pci_write_config_byte(dev, cappos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); - mdelay(100); - - pci_unblock_user_cfg_access(dev); - return 0; -} - -static int __pci_reset_function(struct pci_dev *pdev, int probe) -{ - int res; - - res = __pcie_flr(pdev, probe); - if (res != -ENOTTY) - return res; - - res = __pci_af_flr(pdev, probe); - if (res != 
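/*
 * A minimal sketch of the common fallback pattern for the DMA-mask setters
 * above: try 64-bit addressing first, then settle for 32-bit.
 */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/pci.h>

static int foo_set_dma_masks(struct pci_dev *pdev)
{
	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) &&
	    !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)))
		return 0;

	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) &&
	    !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)))
		return 0;

	dev_err(&pdev->dev, "no usable DMA configuration\n");
	return -EIO;
}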
-ENOTTY) - return res; - - return res; -} - -/** - * pci_execute_reset_function() - Reset a PCI device function - * @dev: Device function to reset - * - * Some devices allow an individual function to be reset without affecting - * other functions in the same device. The PCI device must be responsive - * to PCI config space in order to use this function. - * - * The device function is presumed to be unused when this function is called. - * Resetting the device will make the contents of PCI configuration space - * random, so any caller of this must be prepared to reinitialise the - * device including MSI, bus mastering, BARs, decoding IO and memory spaces, - * etc. - * - * Returns 0 if the device function was successfully reset or -ENOTTY if the - * device doesn't support resetting a single function. - */ -int pci_execute_reset_function(struct pci_dev *dev) -{ - return __pci_reset_function(dev, 0); -} -EXPORT_SYMBOL_GPL(pci_execute_reset_function); - -/** - * pci_reset_function() - quiesce and reset a PCI device function - * @dev: Device function to reset - * - * Some devices allow an individual function to be reset without affecting - * other functions in the same device. The PCI device must be responsive - * to PCI config space in order to use this function. - * - * This function does not just reset the PCI portion of a device, but - * clears all the state associated with the device. This function differs - * from pci_execute_reset_function in that it saves and restores device state - * over the reset. - * - * Returns 0 if the device function was successfully reset or -ENOTTY if the - * device doesn't support resetting a single function. - */ -int pci_reset_function(struct pci_dev *dev) -{ - int r = __pci_reset_function(dev, 1); - - if (r < 0) - return r; - - if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0) - disable_irq(dev->irq); - pci_save_state(dev); - - pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - - r = pci_execute_reset_function(dev); - - pci_restore_state(dev); - if (!dev->msi_enabled && !dev->msix_enabled && dev->irq != 0) - enable_irq(dev->irq); - - return r; -} -EXPORT_SYMBOL_GPL(pci_reset_function); - -/** - * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count - * @dev: PCI device to query - * - * Returns mmrbc: maximum designed memory read count in bytes - * or appropriate error value. - */ -int pcix_get_max_mmrbc(struct pci_dev *dev) -{ - int err, cap; - u32 stat; - - cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!cap) - return -EINVAL; - - err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat); - if (err) - return -EINVAL; - - return (stat & PCI_X_STATUS_MAX_READ) >> 12; -} -EXPORT_SYMBOL(pcix_get_max_mmrbc); - -/** - * pcix_get_mmrbc - get PCI-X maximum memory read byte count - * @dev: PCI device to query - * - * Returns mmrbc: maximum memory read count in bytes - * or appropriate error value. 
- */ -int pcix_get_mmrbc(struct pci_dev *dev) -{ - int ret, cap; - u32 cmd; - - cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!cap) - return -EINVAL; - - ret = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd); - if (!ret) - ret = 512 << ((cmd & PCI_X_CMD_MAX_READ) >> 2); - - return ret; -} -EXPORT_SYMBOL(pcix_get_mmrbc); - -/** - * pcix_set_mmrbc - set PCI-X maximum memory read byte count - * @dev: PCI device to query - * @mmrbc: maximum memory read count in bytes - * valid values are 512, 1024, 2048, 4096 - * - * If possible sets maximum memory read byte count, some bridges have erratas - * that prevent this. - */ -int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc) -{ - int cap, err = -EINVAL; - u32 stat, cmd, v, o; - - if (mmrbc < 512 || mmrbc > 4096 || !is_power_of_2(mmrbc)) - goto out; - - v = ffs(mmrbc) - 10; - - cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!cap) - goto out; - - err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat); - if (err) - goto out; - - if (v > (stat & PCI_X_STATUS_MAX_READ) >> 21) - return -E2BIG; - - err = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd); - if (err) - goto out; - - o = (cmd & PCI_X_CMD_MAX_READ) >> 2; - if (o != v) { - if (v > o && dev->bus && - (dev->bus->bus_flags & PCI_BUS_FLAGS_NO_MMRBC)) - return -EIO; - - cmd &= ~PCI_X_CMD_MAX_READ; - cmd |= v << 2; - err = pci_write_config_dword(dev, cap + PCI_X_CMD, cmd); - } -out: - return err; -} -EXPORT_SYMBOL(pcix_set_mmrbc); - -/** - * pcie_get_readrq - get PCI Express read request size - * @dev: PCI device to query - * - * Returns maximum memory read request in bytes - * or appropriate error value. - */ -int pcie_get_readrq(struct pci_dev *dev) -{ - int ret, cap; - u16 ctl; - - cap = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (!cap) - return -EINVAL; - - ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); - if (!ret) - ret = 128 << ((ctl & PCI_EXP_DEVCTL_READRQ) >> 12); - - return ret; -} -EXPORT_SYMBOL(pcie_get_readrq); - -/** - * pcie_set_readrq - set PCI Express maximum memory read request - * @dev: PCI device to query - * @rq: maximum memory read count in bytes - * valid values are 128, 256, 512, 1024, 2048, 4096 - * - * If possible sets maximum read byte count - */ -int pcie_set_readrq(struct pci_dev *dev, int rq) -{ - int cap, err = -EINVAL; - u16 ctl, v; - - if (rq < 128 || rq > 4096 || !is_power_of_2(rq)) - goto out; - - v = (ffs(rq) - 8) << 12; - - cap = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (!cap) - goto out; - - err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); - if (err) - goto out; - - if ((ctl & PCI_EXP_DEVCTL_READRQ) != v) { - ctl &= ~PCI_EXP_DEVCTL_READRQ; - ctl |= v; - err = pci_write_config_dword(dev, cap + PCI_EXP_DEVCTL, ctl); - } - -out: - return err; -} -EXPORT_SYMBOL(pcie_set_readrq); - -/** - * pci_select_bars - Make BAR mask from the type of resource - * @dev: the PCI device for which BAR mask is made - * @flags: resource type mask to be selected - * - * This helper routine makes bar mask from the type of resource. - */ -int pci_select_bars(struct pci_dev *dev, unsigned long flags) -{ - int i, bars = 0; - for (i = 0; i < PCI_NUM_RESOURCES; i++) - if (pci_resource_flags(dev, i) & flags) - bars |= (1 << i); - return bars; -} - -/** - * pci_resource_bar - get position of the BAR associated with a resource - * @dev: the PCI device - * @resno: the resource number - * @type: the BAR type to be filled in - * - * Returns BAR position in config space, or 0 if the BAR is invalid. 
- */ -int pci_resource_bar(struct pci_dev *dev, int resno, enum pci_bar_type *type) -{ - if (resno < PCI_ROM_RESOURCE) { - *type = pci_bar_unknown; - return PCI_BASE_ADDRESS_0 + 4 * resno; - } else if (resno == PCI_ROM_RESOURCE) { - *type = pci_bar_mem32; - return dev->rom_base_reg; - } - - dev_err(&dev->dev, "BAR: invalid resource #%d\n", resno); - return 0; -} - -static void __devinit pci_no_domains(void) -{ -#ifdef CONFIG_PCI_DOMAINS - pci_domains_supported = 0; -#endif -} - -/** - * pci_ext_cfg_enabled - can we access extended PCI config space? - * @dev: The PCI device of the root bridge. - * - * Returns 1 if we can access PCI extended config space (offsets - * greater than 0xff). This is the default implementation. Architecture - * implementations can override this. - */ -int __attribute__ ((weak)) pci_ext_cfg_avail(struct pci_dev *dev) -{ - return 1; -} - -#ifndef DDE_LINUX -static -#endif -int __devinit pci_init(void) -{ - struct pci_dev *dev = NULL; - - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - pci_fixup_device(pci_fixup_final, dev); - } - - return 0; -} - -static int __init pci_setup(char *str) -{ -#ifndef DDE_LINUX - while (str) { - char *k = strchr(str, ','); - if (k) - *k++ = 0; - if (*str && (str = pcibios_setup(str)) && *str) { - if (!strcmp(str, "nomsi")) { - pci_no_msi(); - } else if (!strcmp(str, "noaer")) { - pci_no_aer(); - } else if (!strcmp(str, "nodomains")) { - pci_no_domains(); - } else if (!strncmp(str, "cbiosize=", 9)) { - pci_cardbus_io_size = memparse(str + 9, &str); - } else if (!strncmp(str, "cbmemsize=", 10)) { - pci_cardbus_mem_size = memparse(str + 10, &str); - } else { - printk(KERN_ERR "PCI: Unknown option `%s'\n", - str); - } - } - str = k; - } -#endif - return 0; -} -early_param("pci", pci_setup); - -device_initcall(pci_init); - -EXPORT_SYMBOL(pci_reenable_device); -EXPORT_SYMBOL(pci_enable_device_io); -EXPORT_SYMBOL(pci_enable_device_mem); -EXPORT_SYMBOL(pci_enable_device); -EXPORT_SYMBOL(pcim_enable_device); -EXPORT_SYMBOL(pcim_pin_device); -EXPORT_SYMBOL(pci_disable_device); -EXPORT_SYMBOL(pci_find_capability); -EXPORT_SYMBOL(pci_bus_find_capability); -EXPORT_SYMBOL(pci_release_regions); -EXPORT_SYMBOL(pci_request_regions); -EXPORT_SYMBOL(pci_request_regions_exclusive); -EXPORT_SYMBOL(pci_release_region); -EXPORT_SYMBOL(pci_request_region); -EXPORT_SYMBOL(pci_request_region_exclusive); -EXPORT_SYMBOL(pci_release_selected_regions); -EXPORT_SYMBOL(pci_request_selected_regions); -EXPORT_SYMBOL(pci_request_selected_regions_exclusive); -EXPORT_SYMBOL(pci_set_master); -EXPORT_SYMBOL(pci_clear_master); -EXPORT_SYMBOL(pci_set_mwi); -EXPORT_SYMBOL(pci_try_set_mwi); -EXPORT_SYMBOL(pci_clear_mwi); -EXPORT_SYMBOL_GPL(pci_intx); -EXPORT_SYMBOL(pci_set_dma_mask); -EXPORT_SYMBOL(pci_set_consistent_dma_mask); -EXPORT_SYMBOL(pci_assign_resource); -EXPORT_SYMBOL(pci_find_parent_resource); -EXPORT_SYMBOL(pci_select_bars); - -EXPORT_SYMBOL(pci_set_power_state); -EXPORT_SYMBOL(pci_save_state); -EXPORT_SYMBOL(pci_restore_state); -EXPORT_SYMBOL(pci_pme_capable); -EXPORT_SYMBOL(pci_pme_active); -EXPORT_SYMBOL(pci_enable_wake); -EXPORT_SYMBOL(pci_wake_from_d3); -EXPORT_SYMBOL(pci_target_state); -EXPORT_SYMBOL(pci_prepare_to_sleep); -EXPORT_SYMBOL(pci_back_from_sleep); -EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state); - diff --git a/libdde_linux26/lib/src/drivers/pci/.svn/text-base/probe.c.svn-base b/libdde_linux26/lib/src/drivers/pci/.svn/text-base/probe.c.svn-base deleted file mode 100644 index 32da5108..00000000 --- 
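The pci.c helpers above (pci_try_set_mwi(), pcie_set_readrq(), pci_set_dma_mask(), pci_execute_reset_function()) are what a DDE-hosted driver would call from its probe path. A minimal sketch of such a caller, assuming a hypothetical example_probe() that is not part of the deleted sources:

/* Illustrative only: a hypothetical probe routine exercising the helpers
 * above. example_probe() and its error labels are assumptions, not part
 * of the deleted DDE/Linux code. */
#include <linux/pci.h>
#include <linux/dma-mapping.h>

static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int err;

	err = pci_enable_device(pdev);
	if (err)
		return err;

	pci_set_master(pdev);

	/* MWI is best-effort; callers need not check pci_try_set_mwi(). */
	pci_try_set_mwi(pdev);

	/* Ask for a 512-byte PCIe read request; non-PCIe parts return -EINVAL. */
	pcie_set_readrq(pdev, 512);

	/* Plain 32-bit DMA addressing is the conservative choice here. */
	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
	if (err)
		goto err_disable;

	return 0;

err_disable:
	pci_disable_device(pdev);
	return err;
}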
a/libdde_linux26/lib/src/drivers/pci/.svn/text-base/probe.c.svn-base +++ /dev/null @@ -1,1232 +0,0 @@ -/* - * probe.c - PCI detection and setup code - */ - -#include <linux/kernel.h> -#include <linux/delay.h> -#include <linux/device.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/cpumask.h> -#include <linux/pci-aspm.h> -#include "pci.h" - -#define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ -#define CARDBUS_RESERVE_BUSNR 3 - -/* Ugh. Need to stop exporting this to modules. */ -LIST_HEAD(pci_root_buses); -EXPORT_SYMBOL(pci_root_buses); - -#ifdef DDE_LINUX -#include "local.h" -#endif - -static int find_anything(struct device *dev, void *data) -{ - return 1; -} - -/* - * Some device drivers need know if pci is initiated. - * Basically, we think pci is not initiated when there - * is no device to be found on the pci_bus_type. - */ -int no_pci_devices(void) -{ - struct device *dev; - int no_devices; - - dev = bus_find_device(&pci_bus_type, NULL, NULL, find_anything); - no_devices = (dev == NULL); - put_device(dev); - return no_devices; -} -EXPORT_SYMBOL(no_pci_devices); - -/* - * PCI Bus Class Devices - */ -static ssize_t pci_bus_show_cpuaffinity(struct device *dev, - int type, - struct device_attribute *attr, - char *buf) -{ - int ret; - const struct cpumask *cpumask; - - cpumask = cpumask_of_pcibus(to_pci_bus(dev)); - ret = type? - cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask) : - cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask); - buf[ret++] = '\n'; - buf[ret] = '\0'; - return ret; -} - -static ssize_t inline pci_bus_show_cpumaskaffinity(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return pci_bus_show_cpuaffinity(dev, 0, attr, buf); -} - -static ssize_t inline pci_bus_show_cpulistaffinity(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return pci_bus_show_cpuaffinity(dev, 1, attr, buf); -} - -DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpumaskaffinity, NULL); -DEVICE_ATTR(cpulistaffinity, S_IRUGO, pci_bus_show_cpulistaffinity, NULL); - -/* - * PCI Bus Class - */ -static void release_pcibus_dev(struct device *dev) -{ - struct pci_bus *pci_bus = to_pci_bus(dev); - - if (pci_bus->bridge) - put_device(pci_bus->bridge); - kfree(pci_bus); -} - -static struct class pcibus_class = { - .name = "pci_bus", - .dev_release = &release_pcibus_dev, -}; - -static int __init pcibus_class_init(void) -{ - return class_register(&pcibus_class); -} -postcore_initcall(pcibus_class_init); - -/* - * Translate the low bits of the PCI base - * to the resource type - */ -static inline unsigned int pci_calc_resource_flags(unsigned int flags) -{ - if (flags & PCI_BASE_ADDRESS_SPACE_IO) - return IORESOURCE_IO; - - if (flags & PCI_BASE_ADDRESS_MEM_PREFETCH) - return IORESOURCE_MEM | IORESOURCE_PREFETCH; - - return IORESOURCE_MEM; -} - -static u64 pci_size(u64 base, u64 maxbase, u64 mask) -{ - u64 size = mask & maxbase; /* Find the significant bits */ - if (!size) - return 0; - - /* Get the lowest of them to find the decode size, and - from that the extent. */ - size = (size & ~(size-1)) - 1; - - /* base == maxbase can be valid only if the BAR has - already been programmed with all 1s. 
*/ - if (base == maxbase && ((base | size) & mask) != mask) - return 0; - - return size; -} - -static inline enum pci_bar_type decode_bar(struct resource *res, u32 bar) -{ - if ((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { - res->flags = bar & ~PCI_BASE_ADDRESS_IO_MASK; - return pci_bar_io; - } - - res->flags = bar & ~PCI_BASE_ADDRESS_MEM_MASK; - - if (res->flags & PCI_BASE_ADDRESS_MEM_TYPE_64) - return pci_bar_mem64; - return pci_bar_mem32; -} - -/** - * pci_read_base - read a PCI BAR - * @dev: the PCI device - * @type: type of the BAR - * @res: resource buffer to be filled in - * @pos: BAR position in the config space - * - * Returns 1 if the BAR is 64-bit, or 0 if 32-bit. - */ -int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, - struct resource *res, unsigned int pos) -{ - u32 l, sz, mask; - - mask = type ? ~PCI_ROM_ADDRESS_ENABLE : ~0; - - res->name = pci_name(dev); - - pci_read_config_dword(dev, pos, &l); - pci_write_config_dword(dev, pos, mask); - pci_read_config_dword(dev, pos, &sz); - pci_write_config_dword(dev, pos, l); - - /* - * All bits set in sz means the device isn't working properly. - * If the BAR isn't implemented, all bits must be 0. If it's a - * memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit - * 1 must be clear. - */ - if (!sz || sz == 0xffffffff) - goto fail; - - /* - * I don't know how l can have all bits set. Copied from old code. - * Maybe it fixes a bug on some ancient platform. - */ - if (l == 0xffffffff) - l = 0; - - if (type == pci_bar_unknown) { - type = decode_bar(res, l); - res->flags |= pci_calc_resource_flags(l) | IORESOURCE_SIZEALIGN; - if (type == pci_bar_io) { - l &= PCI_BASE_ADDRESS_IO_MASK; - mask = PCI_BASE_ADDRESS_IO_MASK & 0xffff; - } else { - l &= PCI_BASE_ADDRESS_MEM_MASK; - mask = (u32)PCI_BASE_ADDRESS_MEM_MASK; - } - } else { - res->flags |= (l & IORESOURCE_ROM_ENABLE); - l &= PCI_ROM_ADDRESS_MASK; - mask = (u32)PCI_ROM_ADDRESS_MASK; - } - - if (type == pci_bar_mem64) { - u64 l64 = l; - u64 sz64 = sz; - u64 mask64 = mask | (u64)~0 << 32; - - pci_read_config_dword(dev, pos + 4, &l); - pci_write_config_dword(dev, pos + 4, ~0); - pci_read_config_dword(dev, pos + 4, &sz); - pci_write_config_dword(dev, pos + 4, l); - - l64 |= ((u64)l << 32); - sz64 |= ((u64)sz << 32); - - sz64 = pci_size(l64, sz64, mask64); - - if (!sz64) - goto fail; - - if ((sizeof(resource_size_t) < 8) && (sz64 > 0x100000000ULL)) { - dev_err(&dev->dev, "can't handle 64-bit BAR\n"); - goto fail; - } else if ((sizeof(resource_size_t) < 8) && l) { - /* Address above 32-bit boundary; disable the BAR */ - pci_write_config_dword(dev, pos, 0); - pci_write_config_dword(dev, pos + 4, 0); - res->start = 0; - res->end = sz64; - } else { - res->start = l64; - res->end = l64 + sz64; - dev_printk(KERN_DEBUG, &dev->dev, - "reg %x 64bit mmio: %pR\n", pos, res); - } - } else { - sz = pci_size(l, sz, mask); - - if (!sz) - goto fail; - - res->start = l; - res->end = l + sz; - - dev_printk(KERN_DEBUG, &dev->dev, "reg %x %s: %pR\n", pos, - (res->flags & IORESOURCE_IO) ? "io port" : "32bit mmio", - res); - } - - out: - return (type == pci_bar_mem64) ? 
1 : 0; - fail: - res->flags = 0; - goto out; -} - -static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) -{ - unsigned int pos, reg; - - for (pos = 0; pos < howmany; pos++) { - struct resource *res = &dev->resource[pos]; - reg = PCI_BASE_ADDRESS_0 + (pos << 2); - pos += __pci_read_base(dev, pci_bar_unknown, res, reg); - } - - if (rom) { - struct resource *res = &dev->resource[PCI_ROM_RESOURCE]; - dev->rom_base_reg = rom; - res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH | - IORESOURCE_READONLY | IORESOURCE_CACHEABLE | - IORESOURCE_SIZEALIGN; - __pci_read_base(dev, pci_bar_mem32, res, rom); - } -} - -void __devinit pci_read_bridge_bases(struct pci_bus *child) -{ - struct pci_dev *dev = child->self; - u8 io_base_lo, io_limit_lo; - u16 mem_base_lo, mem_limit_lo; - unsigned long base, limit; - struct resource *res; - int i; - - if (!dev) /* It's a host bus, nothing to read */ - return; - - if (dev->transparent) { - dev_info(&dev->dev, "transparent bridge\n"); - for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++) - child->resource[i] = child->parent->resource[i - 3]; - } - - res = child->resource[0]; - pci_read_config_byte(dev, PCI_IO_BASE, &io_base_lo); - pci_read_config_byte(dev, PCI_IO_LIMIT, &io_limit_lo); - base = (io_base_lo & PCI_IO_RANGE_MASK) << 8; - limit = (io_limit_lo & PCI_IO_RANGE_MASK) << 8; - - if ((io_base_lo & PCI_IO_RANGE_TYPE_MASK) == PCI_IO_RANGE_TYPE_32) { - u16 io_base_hi, io_limit_hi; - pci_read_config_word(dev, PCI_IO_BASE_UPPER16, &io_base_hi); - pci_read_config_word(dev, PCI_IO_LIMIT_UPPER16, &io_limit_hi); - base |= (io_base_hi << 16); - limit |= (io_limit_hi << 16); - } - - if (base <= limit) { - res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; - if (!res->start) - res->start = base; - if (!res->end) - res->end = limit + 0xfff; - dev_printk(KERN_DEBUG, &dev->dev, "bridge io port: %pR\n", res); - } - - res = child->resource[1]; - pci_read_config_word(dev, PCI_MEMORY_BASE, &mem_base_lo); - pci_read_config_word(dev, PCI_MEMORY_LIMIT, &mem_limit_lo); - base = (mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16; - limit = (mem_limit_lo & PCI_MEMORY_RANGE_MASK) << 16; - if (base <= limit) { - res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM; - res->start = base; - res->end = limit + 0xfffff; - dev_printk(KERN_DEBUG, &dev->dev, "bridge 32bit mmio: %pR\n", - res); - } - - res = child->resource[2]; - pci_read_config_word(dev, PCI_PREF_MEMORY_BASE, &mem_base_lo); - pci_read_config_word(dev, PCI_PREF_MEMORY_LIMIT, &mem_limit_lo); - base = (mem_base_lo & PCI_PREF_RANGE_MASK) << 16; - limit = (mem_limit_lo & PCI_PREF_RANGE_MASK) << 16; - - if ((mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) { - u32 mem_base_hi, mem_limit_hi; - pci_read_config_dword(dev, PCI_PREF_BASE_UPPER32, &mem_base_hi); - pci_read_config_dword(dev, PCI_PREF_LIMIT_UPPER32, &mem_limit_hi); - - /* - * Some bridges set the base > limit by default, and some - * (broken) BIOSes do not initialize them. If we find - * this, just assume they are not being used. 
- */ - if (mem_base_hi <= mem_limit_hi) { -#if BITS_PER_LONG == 64 - base |= ((long) mem_base_hi) << 32; - limit |= ((long) mem_limit_hi) << 32; -#else - if (mem_base_hi || mem_limit_hi) { - dev_err(&dev->dev, "can't handle 64-bit " - "address space for bridge\n"); - return; - } -#endif - } - } - if (base <= limit) { - res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH; - res->start = base; - res->end = limit + 0xfffff; - dev_printk(KERN_DEBUG, &dev->dev, "bridge %sbit mmio pref: %pR\n", - (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64" : "32", - res); - } -} - -static struct pci_bus * pci_alloc_bus(void) -{ - struct pci_bus *b; - - b = kzalloc(sizeof(*b), GFP_KERNEL); - if (b) { - INIT_LIST_HEAD(&b->node); - INIT_LIST_HEAD(&b->children); - INIT_LIST_HEAD(&b->devices); - INIT_LIST_HEAD(&b->slots); - } - return b; -} - -static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent, - struct pci_dev *bridge, int busnr) -{ - struct pci_bus *child; - int i; - - /* - * Allocate a new bus, and inherit stuff from the parent.. - */ - child = pci_alloc_bus(); - if (!child) - return NULL; - - child->parent = parent; - child->ops = parent->ops; - child->sysdata = parent->sysdata; - child->bus_flags = parent->bus_flags; - - /* initialize some portions of the bus device, but don't register it - * now as the parent is not properly set up yet. This device will get - * registered later in pci_bus_add_devices() - */ - child->dev.class = &pcibus_class; - dev_set_name(&child->dev, "%04x:%02x", pci_domain_nr(child), busnr); - - /* - * Set up the primary, secondary and subordinate - * bus numbers. - */ - child->number = child->secondary = busnr; - child->primary = parent->secondary; - child->subordinate = 0xff; - - if (!bridge) - return child; - - child->self = bridge; - child->bridge = get_device(&bridge->dev); - - /* Set up default resource pointers and names.. */ - for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { - child->resource[i] = &bridge->resource[PCI_BRIDGE_RESOURCES+i]; - child->resource[i]->name = child->name; - } - bridge->subordinate = child; - - return child; -} - -struct pci_bus *__ref pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr) -{ - struct pci_bus *child; - - child = pci_alloc_child_bus(parent, dev, busnr); - if (child) { - down_write(&pci_bus_sem); - list_add_tail(&child->node, &parent->children); - up_write(&pci_bus_sem); - } - return child; -} - -static void pci_fixup_parent_subordinate_busnr(struct pci_bus *child, int max) -{ - struct pci_bus *parent = child->parent; - -#ifndef DDE_LINUX - /* Attempts to fix that up are really dangerous unless - we're going to re-assign all bus numbers. */ - if (!pcibios_assign_all_busses()) - return; -#endif - - while (parent->parent && parent->subordinate < max) { - parent->subordinate = max; - pci_write_config_byte(parent->self, PCI_SUBORDINATE_BUS, max); - parent = parent->parent; - } -} - -/* - * If it's a bridge, configure it and scan the bus behind it. - * For CardBus bridges, we don't scan behind as the devices will - * be handled by the bridge driver itself. - * - * We need to process bridges in two passes -- first we scan those - * already configured by the BIOS and after we are done with all of - * them, we proceed to assigning numbers to the remaining buses in - * order to avoid overlaps between old and new bus numbers. 
- */ -int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) -{ - struct pci_bus *child; - int is_cardbus = (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS); - u32 buses, i, j = 0; - u16 bctl; - int broken = 0; - - pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses); - - dev_dbg(&dev->dev, "scanning behind bridge, config %06x, pass %d\n", - buses & 0xffffff, pass); - - /* Check if setup is sensible at all */ - if (!pass && - ((buses & 0xff) != bus->number || ((buses >> 8) & 0xff) <= bus->number)) { - dev_dbg(&dev->dev, "bus configuration invalid, reconfiguring\n"); - broken = 1; - } - - /* Disable MasterAbortMode during probing to avoid reporting - of bus errors (in some architectures) */ - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl); - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, - bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT); - - if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus && !broken) { - unsigned int cmax, busnr; - /* - * Bus already configured by firmware, process it in the first - * pass and just note the configuration. - */ - if (pass) - goto out; - busnr = (buses >> 8) & 0xFF; - - /* - * If we already got to this bus through a different bridge, - * ignore it. This can happen with the i450NX chipset. - */ - if (pci_find_bus(pci_domain_nr(bus), busnr)) { - dev_info(&dev->dev, "bus %04x:%02x already known\n", - pci_domain_nr(bus), busnr); - goto out; - } - - child = pci_add_new_bus(bus, dev, busnr); - if (!child) - goto out; - child->primary = buses & 0xFF; - child->subordinate = (buses >> 16) & 0xFF; - child->bridge_ctl = bctl; - - cmax = pci_scan_child_bus(child); - if (cmax > max) - max = cmax; - if (child->subordinate > max) - max = child->subordinate; - } else { -#ifndef DDE_LINUX - /* - * We need to assign a number to this bus which we always - * do in the second pass. - */ - if (!pass) { - if (pcibios_assign_all_busses() || broken) - /* Temporarily disable forwarding of the - configuration cycles on all bridges in - this bus segment to avoid possible - conflicts in the second pass between two - bridges programmed with overlapping - bus ranges. */ - pci_write_config_dword(dev, PCI_PRIMARY_BUS, - buses & ~0xffffff); - goto out; - } -#endif /* DDE_LINUX */ - - /* Clear errors */ - pci_write_config_word(dev, PCI_STATUS, 0xffff); - - /* Prevent assigning a bus number that already exists. - * This can happen when a bridge is hot-plugged */ - if (pci_find_bus(pci_domain_nr(bus), max+1)) - goto out; - child = pci_add_new_bus(bus, dev, ++max); - buses = (buses & 0xff000000) - | ((unsigned int)(child->primary) << 0) - | ((unsigned int)(child->secondary) << 8) - | ((unsigned int)(child->subordinate) << 16); - - /* - * yenta.c forces a secondary latency timer of 176. - * Copy that behaviour here. - */ - if (is_cardbus) { - buses &= ~0xff000000; - buses |= CARDBUS_LATENCY_TIMER << 24; - } - - /* - * We need to blast all three values with a single write. - */ - pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses); - - if (!is_cardbus) { - child->bridge_ctl = bctl; - /* - * Adjust subordinate busnr in parent buses. - * We do this before scanning for children because - * some devices may not be detected if the bios - * was lazy. - */ - pci_fixup_parent_subordinate_busnr(child, max); - /* Now we can scan all subordinate buses... */ - max = pci_scan_child_bus(child); - /* - * now fix it up again since we have found - * the real value of max. 
- */ - pci_fixup_parent_subordinate_busnr(child, max); - } else { - /* - * For CardBus bridges, we leave 4 bus numbers - * as cards with a PCI-to-PCI bridge can be - * inserted later. - */ - for (i=0; i<CARDBUS_RESERVE_BUSNR; i++) { - struct pci_bus *parent = bus; - if (pci_find_bus(pci_domain_nr(bus), - max+i+1)) - break; - while (parent->parent) { - if ((!pcibios_assign_all_busses()) && - (parent->subordinate > max) && - (parent->subordinate <= max+i)) { - j = 1; - } - parent = parent->parent; - } - if (j) { - /* - * Often, there are two cardbus bridges - * -- try to leave one valid bus number - * for each one. - */ - i /= 2; - break; - } - } - max += i; - pci_fixup_parent_subordinate_busnr(child, max); - } - /* - * Set the subordinate bus number to its real value. - */ - child->subordinate = max; - pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max); - } - - sprintf(child->name, - (is_cardbus ? "PCI CardBus %04x:%02x" : "PCI Bus %04x:%02x"), - pci_domain_nr(bus), child->number); - - /* Has only triggered on CardBus, fixup is in yenta_socket */ - while (bus->parent) { - if ((child->subordinate > bus->subordinate) || - (child->number > bus->subordinate) || - (child->number < bus->number) || - (child->subordinate < bus->number)) { - pr_debug("PCI: Bus #%02x (-#%02x) is %s " - "hidden behind%s bridge #%02x (-#%02x)\n", - child->number, child->subordinate, - (bus->number > child->subordinate && - bus->subordinate < child->number) ? - "wholly" : "partially", - bus->self->transparent ? " transparent" : "", - bus->number, bus->subordinate); - } - bus = bus->parent; - } - -out: - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); - - return max; -} - -/* - * Read interrupt line and base address registers. - * The architecture-dependent code can tweak these, of course. - */ -static void pci_read_irq(struct pci_dev *dev) -{ - unsigned char irq; - - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &irq); - dev->pin = irq; - if (irq) - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); - dev->irq = irq; -} - -#define LEGACY_IO_RESOURCE (IORESOURCE_IO | IORESOURCE_PCI_FIXED) - -/** - * pci_setup_device - fill in class and map information of a device - * @dev: the device structure to fill - * - * Initialize the device structure with information about the device's - * vendor,class,memory and IO-space addresses,IRQ lines etc. - * Called at initialisation of the PCI subsystem and by CardBus services. - * Returns 0 on success and -1 if unknown type of device (not normal, bridge - * or CardBus). 
- */ -static int pci_setup_device(struct pci_dev * dev) -{ - u32 class; - - dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus), - dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn)); - - pci_read_config_dword(dev, PCI_CLASS_REVISION, &class); - dev->revision = class & 0xff; - class >>= 8; /* upper 3 bytes */ - dev->class = class; - class >>= 8; - - dev_dbg(&dev->dev, "found [%04x:%04x] class %06x header type %02x\n", - dev->vendor, dev->device, class, dev->hdr_type); - - /* "Unknown power state" */ - dev->current_state = PCI_UNKNOWN; - - /* Early fixups, before probing the BARs */ - pci_fixup_device(pci_fixup_early, dev); - class = dev->class >> 8; - - switch (dev->hdr_type) { /* header type */ - case PCI_HEADER_TYPE_NORMAL: /* standard header */ - if (class == PCI_CLASS_BRIDGE_PCI) - goto bad; - pci_read_irq(dev); - pci_read_bases(dev, 6, PCI_ROM_ADDRESS); - pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor); - pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device); - - /* - * Do the ugly legacy mode stuff here rather than broken chip - * quirk code. Legacy mode ATA controllers have fixed - * addresses. These are not always echoed in BAR0-3, and - * BAR0-3 in a few cases contain junk! - */ - if (class == PCI_CLASS_STORAGE_IDE) { - u8 progif; - pci_read_config_byte(dev, PCI_CLASS_PROG, &progif); - if ((progif & 1) == 0) { - dev->resource[0].start = 0x1F0; - dev->resource[0].end = 0x1F7; - dev->resource[0].flags = LEGACY_IO_RESOURCE; - dev->resource[1].start = 0x3F6; - dev->resource[1].end = 0x3F6; - dev->resource[1].flags = LEGACY_IO_RESOURCE; - } - if ((progif & 4) == 0) { - dev->resource[2].start = 0x170; - dev->resource[2].end = 0x177; - dev->resource[2].flags = LEGACY_IO_RESOURCE; - dev->resource[3].start = 0x376; - dev->resource[3].end = 0x376; - dev->resource[3].flags = LEGACY_IO_RESOURCE; - } - } - break; - - case PCI_HEADER_TYPE_BRIDGE: /* bridge header */ - if (class != PCI_CLASS_BRIDGE_PCI) - goto bad; - /* The PCI-to-PCI bridge spec requires that subtractive - decoding (i.e. transparent) bridge must have programming - interface code of 0x01. */ - pci_read_irq(dev); - dev->transparent = ((dev->class & 0xff) == 1); - pci_read_bases(dev, 2, PCI_ROM_ADDRESS1); - break; - - case PCI_HEADER_TYPE_CARDBUS: /* CardBus bridge header */ - if (class != PCI_CLASS_BRIDGE_CARDBUS) - goto bad; - pci_read_irq(dev); - pci_read_bases(dev, 1, 0); - pci_read_config_word(dev, PCI_CB_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor); - pci_read_config_word(dev, PCI_CB_SUBSYSTEM_ID, &dev->subsystem_device); - break; - - default: /* unknown header */ - dev_err(&dev->dev, "unknown header type %02x, " - "ignoring device\n", dev->hdr_type); - return -1; - - bad: - dev_err(&dev->dev, "ignoring class %02x (doesn't match header " - "type %02x)\n", class, dev->hdr_type); - dev->class = PCI_CLASS_NOT_DEFINED; - } - - /* We found a fine healthy device, go go go... */ - return 0; -} - -static void pci_release_capabilities(struct pci_dev *dev) -{ - pci_vpd_release(dev); -} - -/** - * pci_release_dev - free a pci device structure when all users of it are finished. - * @dev: device that's been disconnected - * - * Will be called only by the device core when all users of this pci device are - * done. 
- */ -static void pci_release_dev(struct device *dev) -{ - struct pci_dev *pci_dev; - - pci_dev = to_pci_dev(dev); - pci_release_capabilities(pci_dev); - kfree(pci_dev); -} - -static void set_pcie_port_type(struct pci_dev *pdev) -{ - int pos; - u16 reg16; - - pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); - if (!pos) - return; - pdev->is_pcie = 1; - pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16); - pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; -} - -/** - * pci_cfg_space_size - get the configuration space size of the PCI device. - * @dev: PCI device - * - * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express devices - * have 4096 bytes. Even if the device is capable, that doesn't mean we can - * access it. Maybe we don't have a way to generate extended config space - * accesses, or the device is behind a reverse Express bridge. So we try - * reading the dword at 0x100 which must either be 0 or a valid extended - * capability header. - */ -int pci_cfg_space_size_ext(struct pci_dev *dev) -{ - u32 status; - int pos = PCI_CFG_SPACE_SIZE; - - if (pci_read_config_dword(dev, pos, &status) != PCIBIOS_SUCCESSFUL) - goto fail; - if (status == 0xffffffff) - goto fail; - - return PCI_CFG_SPACE_EXP_SIZE; - - fail: - return PCI_CFG_SPACE_SIZE; -} - -int pci_cfg_space_size(struct pci_dev *dev) -{ - int pos; - u32 status; - - pos = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (!pos) { - pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!pos) - goto fail; - - pci_read_config_dword(dev, pos + PCI_X_STATUS, &status); - if (!(status & (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ))) - goto fail; - } - - return pci_cfg_space_size_ext(dev); - - fail: - return PCI_CFG_SPACE_SIZE; -} - -static void pci_release_bus_bridge_dev(struct device *dev) -{ - kfree(dev); -} - -struct pci_dev *alloc_pci_dev(void) -{ - struct pci_dev *dev; - - dev = kzalloc(sizeof(struct pci_dev), GFP_KERNEL); - if (!dev) - return NULL; - - INIT_LIST_HEAD(&dev->bus_list); - - return dev; -} -EXPORT_SYMBOL(alloc_pci_dev); - -/* - * Read the config data for a PCI device, sanity-check it - * and fill in the dev structure... - */ -static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) -{ - struct pci_dev *dev; - struct pci_slot *slot; - u32 l; - u8 hdr_type; - int delay = 1; - - if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l)) - return NULL; - - /* some broken boards return 0 or ~0 if a slot is empty: */ - if (l == 0xffffffff || l == 0x00000000 || - l == 0x0000ffff || l == 0xffff0000) - return NULL; - - /* Configuration request Retry Status */ - while (l == 0xffff0001) { - msleep(delay); - delay *= 2; - if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &l)) - return NULL; - /* Card hasn't responded in 60 seconds? Must be stuck. 
*/ - if (delay > 60 * 1000) { - printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not " - "responding\n", pci_domain_nr(bus), - bus->number, PCI_SLOT(devfn), - PCI_FUNC(devfn)); - return NULL; - } - } - - if (pci_bus_read_config_byte(bus, devfn, PCI_HEADER_TYPE, &hdr_type)) - return NULL; - - dev = alloc_pci_dev(); - if (!dev) - return NULL; - - dev->bus = bus; - dev->sysdata = bus->sysdata; - dev->dev.parent = bus->bridge; - dev->dev.bus = &pci_bus_type; - dev->devfn = devfn; - dev->hdr_type = hdr_type & 0x7f; - dev->multifunction = !!(hdr_type & 0x80); - dev->vendor = l & 0xffff; - dev->device = (l >> 16) & 0xffff; - dev->cfg_size = pci_cfg_space_size(dev); - dev->error_state = pci_channel_io_normal; - set_pcie_port_type(dev); - - list_for_each_entry(slot, &bus->slots, list) - if (PCI_SLOT(devfn) == slot->number) - dev->slot = slot; - - /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) - set this higher, assuming the system even supports it. */ - dev->dma_mask = 0xffffffff; - if (pci_setup_device(dev) < 0) { - kfree(dev); - return NULL; - } - - return dev; -} - -static void pci_init_capabilities(struct pci_dev *dev) -{ - /* MSI/MSI-X list */ - pci_msi_init_pci_dev(dev); - - /* Buffers for saving PCIe and PCI-X capabilities */ - pci_allocate_cap_save_buffers(dev); - - /* Power Management */ - pci_pm_init(dev); - platform_pci_wakeup_init(dev); - - /* Vital Product Data */ - pci_vpd_pci22_init(dev); - - /* Alternative Routing-ID Forwarding */ - pci_enable_ari(dev); -} - -void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) -{ - device_initialize(&dev->dev); - dev->dev.release = pci_release_dev; - pci_dev_get(dev); - - dev->dev.dma_mask = &dev->dma_mask; - dev->dev.dma_parms = &dev->dma_parms; - dev->dev.coherent_dma_mask = 0xffffffffull; - - pci_set_dma_max_seg_size(dev, 65536); - pci_set_dma_seg_boundary(dev, 0xffffffff); - - /* Fix up broken headers */ - pci_fixup_device(pci_fixup_header, dev); - - /* Initialize various capabilities */ - pci_init_capabilities(dev); - - /* - * Add the device to our list of discovered devices - * and the bus list for fixup functions, etc. - */ - down_write(&pci_bus_sem); - list_add_tail(&dev->bus_list, &bus->devices); - up_write(&pci_bus_sem); -} - -struct pci_dev *__ref pci_scan_single_device(struct pci_bus *bus, int devfn) -{ - struct pci_dev *dev; - - dev = pci_scan_device(bus, devfn); - if (!dev) - return NULL; - - pci_device_add(dev, bus); - - return dev; -} -EXPORT_SYMBOL(pci_scan_single_device); - -/** - * pci_scan_slot - scan a PCI slot on a bus for devices. - * @bus: PCI bus to scan - * @devfn: slot number to scan (must have zero function.) - * - * Scan a PCI slot on the specified PCI bus for devices, adding - * discovered devices to the @bus->devices list. New devices - * will not have is_added set. - */ -int pci_scan_slot(struct pci_bus *bus, int devfn) -{ - int func, nr = 0; - int scan_all_fns; - - scan_all_fns = pcibios_scan_all_fns(bus, devfn); - - for (func = 0; func < 8; func++, devfn++) { - struct pci_dev *dev; - - dev = pci_scan_single_device(bus, devfn); - if (dev) { - nr++; - - /* - * If this is a single function device, - * don't scan past the first function. 
- */ - if (!dev->multifunction) { - if (func > 0) { - dev->multifunction = 1; - } else { - break; - } - } - } else { - if (func == 0 && !scan_all_fns) - break; - } - } - - /* only one slot has pcie device */ - if (bus->self && nr) - pcie_aspm_init_link_state(bus->self); - - return nr; -} - -unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) -{ - unsigned int devfn, pass, max = bus->secondary; - struct pci_dev *dev; - - pr_debug("PCI: Scanning bus %04x:%02x\n", pci_domain_nr(bus), bus->number); - - /* Go find them, Rover! */ - for (devfn = 0; devfn < 0x100; devfn += 8) - pci_scan_slot(bus, devfn); - -#ifndef DDE_LINUX - /* - * After performing arch-dependent fixup of the bus, look behind - * all PCI-to-PCI bridges on this bus. - */ - pr_debug("PCI: Fixups for bus %04x:%02x\n", pci_domain_nr(bus), bus->number); - pcibios_fixup_bus(bus); - for (pass=0; pass < 2; pass++) - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || - dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) - max = pci_scan_bridge(bus, dev, max, pass); - } -#endif - - /* - * We've scanned the bus and so we know all about what's on - * the other side of any bridges that may be on this bus plus - * any devices. - * - * Return how far we've got finding sub-buses. - */ - pr_debug("PCI: Bus scan for %04x:%02x returning with max=%02x\n", - pci_domain_nr(bus), bus->number, max); - return max; -} - -void __attribute__((weak)) set_pci_bus_resources_arch_default(struct pci_bus *b) -{ -} - -struct pci_bus * pci_create_bus(struct device *parent, - int bus, struct pci_ops *ops, void *sysdata) -{ - int error; - struct pci_bus *b; - struct device *dev; - - b = pci_alloc_bus(); - if (!b) - return NULL; - - dev = kmalloc(sizeof(*dev), GFP_KERNEL); - if (!dev){ - kfree(b); - return NULL; - } - - b->sysdata = sysdata; - b->ops = ops; - - if (pci_find_bus(pci_domain_nr(b), bus)) { - /* If we already got to this bus through a different bridge, ignore it */ - pr_debug("PCI: Bus %04x:%02x already known\n", pci_domain_nr(b), bus); - goto err_out; - } - - down_write(&pci_bus_sem); - list_add_tail(&b->node, &pci_root_buses); - up_write(&pci_bus_sem); - - memset(dev, 0, sizeof(*dev)); - dev->parent = parent; - dev->release = pci_release_bus_bridge_dev; - dev_set_name(dev, "pci%04x:%02x", pci_domain_nr(b), bus); - error = device_register(dev); - if (error) - goto dev_reg_err; - b->bridge = get_device(dev); - - if (!parent) - set_dev_node(b->bridge, pcibus_to_node(b)); - - b->dev.class = &pcibus_class; - b->dev.parent = b->bridge; - dev_set_name(&b->dev, "%04x:%02x", pci_domain_nr(b), bus); - error = device_register(&b->dev); - if (error) - goto class_dev_reg_err; - error = device_create_file(&b->dev, &dev_attr_cpuaffinity); - if (error) - goto dev_create_file_err; - - /* Create legacy_io and legacy_mem files for this bus */ - pci_create_legacy_files(b); - - b->number = b->secondary = bus; - b->resource[0] = &ioport_resource; - b->resource[1] = &iomem_resource; - - set_pci_bus_resources_arch_default(b); - - return b; - -dev_create_file_err: - device_unregister(&b->dev); -class_dev_reg_err: - device_unregister(dev); -dev_reg_err: - down_write(&pci_bus_sem); - list_del(&b->node); - up_write(&pci_bus_sem); -err_out: - kfree(dev); - kfree(b); - return NULL; -} - -struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, - int bus, struct pci_ops *ops, void *sysdata) -{ - struct pci_bus *b; - - b = pci_create_bus(parent, bus, ops, sysdata); - if (b) - b->subordinate = pci_scan_child_bus(b); - 
return b; -} -EXPORT_SYMBOL(pci_scan_bus_parented); - -#ifdef CONFIG_HOTPLUG -EXPORT_SYMBOL(pci_add_new_bus); -EXPORT_SYMBOL(pci_scan_slot); -EXPORT_SYMBOL(pci_scan_bridge); -EXPORT_SYMBOL_GPL(pci_scan_child_bus); -#endif - -static int __init pci_sort_bf_cmp(const struct device *d_a, const struct device *d_b) -{ - const struct pci_dev *a = to_pci_dev(d_a); - const struct pci_dev *b = to_pci_dev(d_b); - - if (pci_domain_nr(a->bus) < pci_domain_nr(b->bus)) return -1; - else if (pci_domain_nr(a->bus) > pci_domain_nr(b->bus)) return 1; - - if (a->bus->number < b->bus->number) return -1; - else if (a->bus->number > b->bus->number) return 1; - - if (a->devfn < b->devfn) return -1; - else if (a->devfn > b->devfn) return 1; - - return 0; -} - -void __init pci_sort_breadthfirst(void) -{ - bus_sort_breadthfirst(&pci_bus_type, &pci_sort_bf_cmp); -} diff --git a/libdde_linux26/lib/src/fs/.svn/all-wcprops b/libdde_linux26/lib/src/fs/.svn/all-wcprops deleted file mode 100644 index b87b0b35..00000000 --- a/libdde_linux26/lib/src/fs/.svn/all-wcprops +++ /dev/null @@ -1,23 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 61 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/fs -END -block_dev.c -K 25 -svn:wc:ra_dav:version-url -V 73 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/fs/block_dev.c -END -buffer.c -K 25 -svn:wc:ra_dav:version-url -V 70 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/fs/buffer.c -END -char_dev.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/fs/char_dev.c -END diff --git a/libdde_linux26/lib/src/fs/.svn/entries b/libdde_linux26/lib/src/fs/.svn/entries deleted file mode 100644 index fa781e75..00000000 --- a/libdde_linux26/lib/src/fs/.svn/entries +++ /dev/null @@ -1,130 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/fs -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -block_dev.c -file - - - - -2009-11-15T17:17:10.000000Z -eb568fcd29a19e618484d4b5543c680f -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -33946 - -buffer.c -file - - - - -2009-11-15T17:17:10.000000Z -4d9f46822ca7a0a24129334ac2d50dd2 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -92122 - -char_dev.c -file - - - - -2009-11-15T17:17:10.000000Z -7dd8da71bba451311d6d91135ae21bf2 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -13672 - diff --git a/libdde_linux26/lib/src/fs/.svn/format b/libdde_linux26/lib/src/fs/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/fs/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/fs/.svn/text-base/block_dev.c.svn-base b/libdde_linux26/lib/src/fs/.svn/text-base/block_dev.c.svn-base deleted file mode 100644 index 4c4c2f64..00000000 --- a/libdde_linux26/lib/src/fs/.svn/text-base/block_dev.c.svn-base +++ /dev/null @@ -1,1422 +0,0 @@ -/* - * linux/fs/block_dev.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE - */ - -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/fcntl.h> -#include <linux/slab.h> -#include <linux/kmod.h> -#include <linux/major.h> -#include <linux/smp_lock.h> -#include <linux/device_cgroup.h> -#include <linux/highmem.h> -#include <linux/blkdev.h> -#include 
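The probe.c interfaces above (pci_create_bus(), pci_scan_child_bus(), pci_scan_bus_parented()) are what a host-bridge or DDE back-end uses to bring up bus 0. A minimal sketch, assuming a hypothetical example_pci_ops config-space accessor that is not part of the deleted sources:

/* Illustrative only: enumerating a root bus with the interfaces above.
 * example_pci_ops and example_scan_root() are hypothetical names. */
#include <linux/init.h>
#include <linux/pci.h>

extern struct pci_ops example_pci_ops;	/* config-space read/write accessors */

static int __init example_scan_root(void)
{
	struct pci_bus *root;

	/* Create bus 0 with no parent device and no private sysdata. */
	root = pci_create_bus(NULL, 0, &example_pci_ops, NULL);
	if (!root)
		return -ENOMEM;

	/* Walk every devfn on bus 0; subordinate records the highest bus seen. */
	root->subordinate = pci_scan_child_bus(root);

	/* Register the discovered devices with the driver core. */
	pci_bus_add_devices(root);
	return 0;
}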
<linux/module.h> -#include <linux/blkpg.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include <linux/mpage.h> -#include <linux/mount.h> -#include <linux/uio.h> -#include <linux/namei.h> -#include <linux/log2.h> -#include <asm/uaccess.h> -#include "internal.h" - -#ifdef DDE_LINUX -#include "local.h" -#endif - -struct bdev_inode { - struct block_device bdev; - struct inode vfs_inode; -}; - -static const struct address_space_operations def_blk_aops; - -static inline struct bdev_inode *BDEV_I(struct inode *inode) -{ - return container_of(inode, struct bdev_inode, vfs_inode); -} - -inline struct block_device *I_BDEV(struct inode *inode) -{ - return &BDEV_I(inode)->bdev; -} - -EXPORT_SYMBOL(I_BDEV); - -static sector_t max_block(struct block_device *bdev) -{ - sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); - - if (sz) { - unsigned int size = block_size(bdev); - unsigned int sizebits = blksize_bits(size); - retval = (sz >> sizebits); - } - return retval; -} - -/* Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) -{ - if (bdev->bd_inode->i_mapping->nrpages == 0) - return; - invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); -} - -int set_blocksize(struct block_device *bdev, int size) -{ - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) - return -EINVAL; - - /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_hardsect_size(bdev)) - return -EINVAL; - - /* Don't change the size if it is same as current */ - if (bdev->bd_block_size != size) { - sync_blockdev(bdev); - bdev->bd_block_size = size; - bdev->bd_inode->i_blkbits = blksize_bits(size); - kill_bdev(bdev); - } - return 0; -} - -EXPORT_SYMBOL(set_blocksize); - -int sb_set_blocksize(struct super_block *sb, int size) -{ - if (set_blocksize(sb->s_bdev, size)) - return 0; - /* If we get here, we know size is power of two - * and it's value is between 512 and PAGE_SIZE */ - sb->s_blocksize = size; - sb->s_blocksize_bits = blksize_bits(size); - return sb->s_blocksize; -} - -EXPORT_SYMBOL(sb_set_blocksize); - -int sb_min_blocksize(struct super_block *sb, int size) -{ - int minsize = bdev_hardsect_size(sb->s_bdev); - if (size < minsize) - size = minsize; - return sb_set_blocksize(sb, size); -} - -EXPORT_SYMBOL(sb_min_blocksize); - -static int -blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - if (iblock >= max_block(I_BDEV(inode))) { - if (create) - return -EIO; - - /* - * for reads, we're just trying to fill a partial page. - * return a hole, they will have to call get_block again - * before they can fill it, and they will get -EIO at that - * time - */ - return 0; - } - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - -static int -blkdev_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - sector_t end_block = max_block(I_BDEV(inode)); - unsigned long max_blocks = bh->b_size >> inode->i_blkbits; - - if ((iblock + max_blocks) > end_block) { - max_blocks = end_block - iblock; - if ((long)max_blocks <= 0) { - if (create) - return -EIO; /* write fully beyond EOF */ - /* - * It is a read which is fully beyond EOF. 
We return - * a !buffer_mapped buffer - */ - max_blocks = 0; - } - } - - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - bh->b_size = max_blocks << inode->i_blkbits; - if (max_blocks) - set_buffer_mapped(bh); - return 0; -} - -static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - -#ifndef DDE_LINUX - return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), - iov, offset, nr_segs, blkdev_get_blocks, NULL); -#else - WARN_UNIMPL; - return 0; -#endif /* DDE_LINUX */ -} - -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, blkdev_get_block, wbc); -} - -static int blkdev_readpage(struct file * file, struct page * page) -{ - return block_read_full_page(page, blkdev_get_block); -} - -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - blkdev_get_block); -} - -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int ret; - ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - unlock_page(page); - page_cache_release(page); - - return ret; -} - -/* - * private llseek: - * for a block special file file->f_path.dentry->d_inode->i_size is zero - * so we compute the size by hand (just as in block_read/write above) - */ -static loff_t block_llseek(struct file *file, loff_t offset, int origin) -{ - struct inode *bd_inode = file->f_mapping->host; - loff_t size; - loff_t retval; - - mutex_lock(&bd_inode->i_mutex); - size = i_size_read(bd_inode); - - switch (origin) { - case 2: - offset += size; - break; - case 1: - offset += file->f_pos; - } - retval = -EINVAL; - if (offset >= 0 && offset <= size) { - if (offset != file->f_pos) { - file->f_pos = offset; - } - retval = offset; - } - mutex_unlock(&bd_inode->i_mutex); - return retval; -} - -/* - * Filp is never NULL; the only case when ->fsync() is called with - * NULL first argument is nfsd_sync_dir() and that's not a directory. - */ - -static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) -{ - return sync_blockdev(I_BDEV(filp->f_mapping->host)); -} - -/* - * pseudo-fs - */ - -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); -static struct kmem_cache * bdev_cachep __read_mostly; - -static struct inode *bdev_alloc_inode(struct super_block *sb) -{ - struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void bdev_destroy_inode(struct inode *inode) -{ - struct bdev_inode *bdi = BDEV_I(inode); - - bdi->bdev.bd_inode_backing_dev_info = NULL; - kmem_cache_free(bdev_cachep, bdi); -} - -static void init_once(void *foo) -{ - struct bdev_inode *ei = (struct bdev_inode *) foo; - struct block_device *bdev = &ei->bdev; - - memset(bdev, 0, sizeof(*bdev)); - mutex_init(&bdev->bd_mutex); - sema_init(&bdev->bd_mount_sem, 1); - INIT_LIST_HEAD(&bdev->bd_inodes); - INIT_LIST_HEAD(&bdev->bd_list); -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_list); -#endif - inode_init_once(&ei->vfs_inode); - /* Initialize mutex for freeze. 
*/ - mutex_init(&bdev->bd_fsfreeze_mutex); -} - -static inline void __bd_forget(struct inode *inode) -{ - list_del_init(&inode->i_devices); - inode->i_bdev = NULL; - inode->i_mapping = &inode->i_data; -} - -static void bdev_clear_inode(struct inode *inode) -{ - struct block_device *bdev = &BDEV_I(inode)->bdev; - struct list_head *p; - spin_lock(&bdev_lock); - while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { - __bd_forget(list_entry(p, struct inode, i_devices)); - } - list_del_init(&bdev->bd_list); - spin_unlock(&bdev_lock); -} - -static const struct super_operations bdev_sops = { - .statfs = simple_statfs, - .alloc_inode = bdev_alloc_inode, - .destroy_inode = bdev_destroy_inode, - .drop_inode = generic_delete_inode, - .clear_inode = bdev_clear_inode, -}; - -static int bd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); -} - -static struct file_system_type bd_type = { - .name = "bdev", - .get_sb = bd_get_sb, - .kill_sb = kill_anon_super, -}; - -struct super_block *blockdev_superblock __read_mostly; - -void __init bdev_cache_init(void) -{ - int err; - struct vfsmount *bd_mnt; - - bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), - 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), - init_once); - err = register_filesystem(&bd_type); - if (err) - panic("Cannot register bdev pseudo-fs"); - bd_mnt = kern_mount(&bd_type); - if (IS_ERR(bd_mnt)) - panic("Cannot create bdev pseudo-fs"); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ -} - -/* - * Most likely _very_ bad one - but then it's hardly critical for small - * /dev and can be fixed when somebody will need really large one. - * Keep in mind that it will be fed through icache hash function too. 
- */ -static inline unsigned long hash(dev_t dev) -{ - return MAJOR(dev)+MINOR(dev); -} - -static int bdev_test(struct inode *inode, void *data) -{ - return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; -} - -static int bdev_set(struct inode *inode, void *data) -{ - BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; - return 0; -} - -static LIST_HEAD(all_bdevs); - -struct block_device *bdget(dev_t dev) -{ - struct block_device *bdev; - struct inode *inode; - - printk_all_partitions(); - - inode = iget5_locked(blockdev_superblock, hash(dev), - bdev_test, bdev_set, &dev); - - if (!inode) - return NULL; - - bdev = &BDEV_I(inode)->bdev; - - if (inode->i_state & I_NEW) { - bdev->bd_contains = NULL; - bdev->bd_inode = inode; - bdev->bd_block_size = (1 << inode->i_blkbits); - bdev->bd_part_count = 0; - bdev->bd_invalidated = 0; - inode->i_mode = S_IFBLK; - inode->i_rdev = dev; - inode->i_bdev = bdev; - inode->i_data.a_ops = &def_blk_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - inode->i_data.backing_dev_info = &default_backing_dev_info; - spin_lock(&bdev_lock); - list_add(&bdev->bd_list, &all_bdevs); - spin_unlock(&bdev_lock); - unlock_new_inode(inode); - } - return bdev; -} - -EXPORT_SYMBOL(bdget); - -long nr_blockdev_pages(void) -{ - struct block_device *bdev; - long ret = 0; - spin_lock(&bdev_lock); - list_for_each_entry(bdev, &all_bdevs, bd_list) { - ret += bdev->bd_inode->i_mapping->nrpages; - } - spin_unlock(&bdev_lock); - return ret; -} - -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} - -EXPORT_SYMBOL(bdput); - -static struct block_device *bd_acquire(struct inode *inode) -{ - struct block_device *bdev; - - spin_lock(&bdev_lock); - bdev = inode->i_bdev; - if (bdev) { - atomic_inc(&bdev->bd_inode->i_count); - spin_unlock(&bdev_lock); - return bdev; - } - spin_unlock(&bdev_lock); - - bdev = bdget(inode->i_rdev); - if (bdev) { - spin_lock(&bdev_lock); - if (!inode->i_bdev) { - /* - * We take an additional bd_inode->i_count for inode, - * and it's released in clear_inode() of inode. - * So, we can access it via ->i_mapping always - * without igrab(). 
- */ - atomic_inc(&bdev->bd_inode->i_count); - inode->i_bdev = bdev; - inode->i_mapping = bdev->bd_inode->i_mapping; - list_add(&inode->i_devices, &bdev->bd_inodes); - } - spin_unlock(&bdev_lock); - } - return bdev; -} - -/* Call when you free inode */ - -void bd_forget(struct inode *inode) -{ - struct block_device *bdev = NULL; - - spin_lock(&bdev_lock); - if (inode->i_bdev) { - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - __bd_forget(inode); - } - spin_unlock(&bdev_lock); - - if (bdev) - iput(bdev->bd_inode); -} - -int bd_claim(struct block_device *bdev, void *holder) -{ - int res; - spin_lock(&bdev_lock); - - /* first decide result */ - if (bdev->bd_holder == holder) - res = 0; /* already a holder */ - else if (bdev->bd_holder != NULL) - res = -EBUSY; /* held by someone else */ - else if (bdev->bd_contains == bdev) - res = 0; /* is a whole device which isn't held */ - - else if (bdev->bd_contains->bd_holder == bd_claim) - res = 0; /* is a partition of a device that is being partitioned */ - else if (bdev->bd_contains->bd_holder != NULL) - res = -EBUSY; /* is a partition of a held device */ - else - res = 0; /* is a partition of an un-held device */ - - /* now impose change */ - if (res==0) { - /* note that for a whole device bd_holders - * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder - */ - bdev->bd_contains->bd_holders ++; - bdev->bd_contains->bd_holder = bd_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; - } - spin_unlock(&bdev_lock); - return res; -} - -EXPORT_SYMBOL(bd_claim); - -void bd_release(struct block_device *bdev) -{ - spin_lock(&bdev_lock); - if (!--bdev->bd_contains->bd_holders) - bdev->bd_contains->bd_holder = NULL; - if (!--bdev->bd_holders) - bdev->bd_holder = NULL; - spin_unlock(&bdev_lock); -} - -EXPORT_SYMBOL(bd_release); - -#ifdef CONFIG_SYSFS -/* - * Functions for bd_claim_by_kobject / bd_release_from_kobject - * - * If a kobject is passed to bd_claim_by_kobject() - * and the kobject has a parent directory, - * following symlinks are created: - * o from the kobject to the claimed bdev - * o from "holders" directory of the bdev to the parent of the kobject - * bd_release_from_kobject() removes these symlinks. - * - * Example: - * If /dev/dm-0 maps to /dev/sda, kobject corresponding to - * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - */ - -static int add_symlink(struct kobject *from, struct kobject *to) -{ - if (!from || !to) - return 0; - return sysfs_create_link(from, to, kobject_name(to)); -} - -static void del_symlink(struct kobject *from, struct kobject *to) -{ - if (!from || !to) - return; - sysfs_remove_link(from, kobject_name(to)); -} - -/* - * 'struct bd_holder' contains pointers to kobjects symlinked by - * bd_claim_by_kobject. - * It's connected to bd_holder_list which is protected by bdev->bd_sem. - */ -struct bd_holder { - struct list_head list; /* chain of holders of the bdev */ - int count; /* references from the holder */ - struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ - struct kobject *hdev; /* e.g. "/block/dm-0" */ - struct kobject *hdir; /* e.g. "/block/sda/holders" */ - struct kobject *sdev; /* e.g. "/block/sda" */ -}; - -/* - * Get references of related kobjects at once. - * Returns 1 on success. 0 on failure. - * - * Should call bd_holder_release_dirs() after successful use. 
- */ -static int bd_holder_grab_dirs(struct block_device *bdev, - struct bd_holder *bo) -{ - if (!bdev || !bo) - return 0; - - bo->sdir = kobject_get(bo->sdir); - if (!bo->sdir) - return 0; - - bo->hdev = kobject_get(bo->sdir->parent); - if (!bo->hdev) - goto fail_put_sdir; - - bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); - if (!bo->sdev) - goto fail_put_hdev; - - bo->hdir = kobject_get(bdev->bd_part->holder_dir); - if (!bo->hdir) - goto fail_put_sdev; - - return 1; - -fail_put_sdev: - kobject_put(bo->sdev); -fail_put_hdev: - kobject_put(bo->hdev); -fail_put_sdir: - kobject_put(bo->sdir); - - return 0; -} - -/* Put references of related kobjects at once. */ -static void bd_holder_release_dirs(struct bd_holder *bo) -{ - kobject_put(bo->hdir); - kobject_put(bo->sdev); - kobject_put(bo->hdev); - kobject_put(bo->sdir); -} - -static struct bd_holder *alloc_bd_holder(struct kobject *kobj) -{ - struct bd_holder *bo; - - bo = kzalloc(sizeof(*bo), GFP_KERNEL); - if (!bo) - return NULL; - - bo->count = 1; - bo->sdir = kobj; - - return bo; -} - -static void free_bd_holder(struct bd_holder *bo) -{ - kfree(bo); -} - -/** - * find_bd_holder - find matching struct bd_holder from the block device - * - * @bdev: struct block device to be searched - * @bo: target struct bd_holder - * - * Returns matching entry with @bo in @bdev->bd_holder_list. - * If found, increment the reference count and return the pointer. - * If not found, returns NULL. - */ -static struct bd_holder *find_bd_holder(struct block_device *bdev, - struct bd_holder *bo) -{ - struct bd_holder *tmp; - - list_for_each_entry(tmp, &bdev->bd_holder_list, list) - if (tmp->sdir == bo->sdir) { - tmp->count++; - return tmp; - } - - return NULL; -} - -/** - * add_bd_holder - create sysfs symlinks for bd_claim() relationship - * - * @bdev: block device to be bd_claimed - * @bo: preallocated and initialized by alloc_bd_holder() - * - * Add @bo to @bdev->bd_holder_list, create symlinks. - * - * Returns 0 if symlinks are created. - * Returns -ve if something fails. - */ -static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) -{ - int err; - - if (!bo) - return -EINVAL; - - if (!bd_holder_grab_dirs(bdev, bo)) - return -EBUSY; - - err = add_symlink(bo->sdir, bo->sdev); - if (err) - return err; - - err = add_symlink(bo->hdir, bo->hdev); - if (err) { - del_symlink(bo->sdir, bo->sdev); - return err; - } - - list_add_tail(&bo->list, &bdev->bd_holder_list); - return 0; -} - -/** - * del_bd_holder - delete sysfs symlinks for bd_claim() relationship - * - * @bdev: block device to be bd_claimed - * @kobj: holder's kobject - * - * If there is matching entry with @kobj in @bdev->bd_holder_list - * and no other bd_claim() from the same kobject, - * remove the struct bd_holder from the list, delete symlinks for it. - * - * Returns a pointer to the struct bd_holder when it's removed from the list - * and ready to be freed. - * Returns NULL if matching claim isn't found or there is other bd_claim() - * by the same kobject. 
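
add_bd_holder() above creates the two symlinks described earlier (the slave entry plus the holder back-pointer) and undoes the first one if the second fails. A loose userspace analogue with plain symlink()/unlink() in the current directory (the link names and targets are only illustrative; the real code operates on sysfs kobjects):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Illustrative names mirroring /sys/block/dm-0/slaves/sda and
	 * /sys/block/sda/holders/dm-0 from the example above. */
	const char *slave_link  = "dm-0.slaves.sda";
	const char *holder_link = "sda.holders.dm-0";

	if (symlink("/sys/block/sda", slave_link) != 0) {
		perror("slave symlink");
		return 1;
	}
	/* Second link; on failure roll the first one back, just as
	 * add_bd_holder() calls del_symlink() before bailing out. */
	if (symlink("/sys/block/dm-0", holder_link) != 0) {
		perror("holder symlink");
		unlink(slave_link);
		return 1;
	}

	/* del_bd_holder() removes both again once the last claim goes away. */
	unlink(holder_link);
	unlink(slave_link);
	return 0;
}
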
- */ -static struct bd_holder *del_bd_holder(struct block_device *bdev, - struct kobject *kobj) -{ - struct bd_holder *bo; - - list_for_each_entry(bo, &bdev->bd_holder_list, list) { - if (bo->sdir == kobj) { - bo->count--; - BUG_ON(bo->count < 0); - if (!bo->count) { - list_del(&bo->list); - del_symlink(bo->sdir, bo->sdev); - del_symlink(bo->hdir, bo->hdev); - bd_holder_release_dirs(bo); - return bo; - } - break; - } - } - - return NULL; -} - -/** - * bd_claim_by_kobject - bd_claim() with additional kobject signature - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @kobj: holder's kobject - * - * Do bd_claim() and if it succeeds, create sysfs symlinks between - * the bdev and the holder's kobject. - * Use bd_release_from_kobject() when relesing the claimed bdev. - * - * Returns 0 on success. (same as bd_claim()) - * Returns errno on failure. - */ -static int bd_claim_by_kobject(struct block_device *bdev, void *holder, - struct kobject *kobj) -{ - int err; - struct bd_holder *bo, *found; - - if (!kobj) - return -EINVAL; - - bo = alloc_bd_holder(kobj); - if (!bo) - return -ENOMEM; - - mutex_lock(&bdev->bd_mutex); - - err = bd_claim(bdev, holder); - if (err) - goto fail; - - found = find_bd_holder(bdev, bo); - if (found) - goto fail; - - err = add_bd_holder(bdev, bo); - if (err) - bd_release(bdev); - else - bo = NULL; -fail: - mutex_unlock(&bdev->bd_mutex); - free_bd_holder(bo); - return err; -} - -/** - * bd_release_from_kobject - bd_release() with additional kobject signature - * - * @bdev: block device to be released - * @kobj: holder's kobject - * - * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). - */ -static void bd_release_from_kobject(struct block_device *bdev, - struct kobject *kobj) -{ - if (!kobj) - return; - - mutex_lock(&bdev->bd_mutex); - bd_release(bdev); - free_bd_holder(del_bd_holder(bdev, kobj)); - mutex_unlock(&bdev->bd_mutex); -} - -/** - * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() - * - * @bdev: block device to be claimed - * @holder: holder's signature - * @disk: holder's gendisk - * - * Call bd_claim_by_kobject() with getting @disk->slave_dir. - */ -int bd_claim_by_disk(struct block_device *bdev, void *holder, - struct gendisk *disk) -{ - return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); -} -EXPORT_SYMBOL_GPL(bd_claim_by_disk); - -/** - * bd_release_from_disk - wrapper function for bd_release_from_kobject() - * - * @bdev: block device to be claimed - * @disk: holder's gendisk - * - * Call bd_release_from_kobject() and put @disk->slave_dir. - */ -void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) -{ - bd_release_from_kobject(bdev, disk->slave_dir); - kobject_put(disk->slave_dir); -} -EXPORT_SYMBOL_GPL(bd_release_from_disk); -#endif - -/* - * Tries to open block device by device number. Use it ONLY if you - * really do not have anything better - i.e. when you are behind a - * truly sucky interface and all you are given is a device number. _Never_ - * to be used for internal purposes. If you ever need it - reconsider - * your API. - */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, mode); - return err ? ERR_PTR(err) : bdev; -} - -EXPORT_SYMBOL(open_by_devnum); - -/** - * flush_disk - invalidates all buffer-cache entries on a disk - * - * @bdev: struct block device to be flushed - * - * Invalidates all buffer-cache entries on a disk. 
It should be called - * when a disk has been changed -- either by a media change or online - * resize. - */ -static void flush_disk(struct block_device *bdev) -{ - if (__invalidate_device(bdev)) { - char name[BDEVNAME_SIZE] = ""; - - if (bdev->bd_disk) - disk_name(bdev->bd_disk, 0, name); - printk(KERN_WARNING "VFS: busy inodes on changed media or " - "resized disk %s\n", name); - } - - if (!bdev->bd_disk) - return; - if (disk_partitionable(bdev->bd_disk)) - bdev->bd_invalidated = 1; -} - -/** - * check_disk_size_change - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @bdev: struct bdev to adjust. - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. - */ -void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) -{ - loff_t disk_size, bdev_size; - - disk_size = (loff_t)get_capacity(disk) << 9; - bdev_size = i_size_read(bdev->bd_inode); - if (disk_size != bdev_size) { - char name[BDEVNAME_SIZE]; - - disk_name(disk, 0, name); - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - name, bdev_size, disk_size); - i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev); - } -} -EXPORT_SYMBOL(check_disk_size_change); - -/** - * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back - * @disk: struct gendisk to be revalidated - * - * This routine is a wrapper for lower-level driver's revalidate_disk - * call-backs. It is used to do common pre and post operations needed - * for all revalidate_disk operations. - */ -int revalidate_disk(struct gendisk *disk) -{ - struct block_device *bdev; - int ret = 0; - - if (disk->fops->revalidate_disk) - ret = disk->fops->revalidate_disk(disk); - - bdev = bdget_disk(disk, 0); - if (!bdev) - return ret; - - mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev); - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - return ret; -} -EXPORT_SYMBOL(revalidate_disk); - -/* - * This routine checks whether a removable media has been changed, - * and invalidates all buffer-cache-entries in that case. This - * is a relatively slow routine, so we have to try to minimize using - * it. Thus it is called only upon a 'mount' or 'open'. This - * is the best way of combining speed and utility, I think. 
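
check_disk_size_change() above compares the gendisk capacity (in 512-byte sectors, hence the << 9) against the size cached in the bdev inode. A rough userspace counterpart asks the driver for the current size with the BLKGETSIZE64 ioctl (argv[1] is assumed to be a readable block device node):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKGETSIZE64 */

int main(int argc, char **argv)
{
	uint64_t bytes;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <block-device>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Size in bytes, i.e. the capacity in 512-byte sectors shifted
	 * left by 9, matching the computation in the code above. */
	if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) {
		perror("BLKGETSIZE64");
		close(fd);
		return 1;
	}
	printf("%s: %llu bytes (%llu sectors)\n", argv[1],
	       (unsigned long long)bytes, (unsigned long long)(bytes >> 9));
	close(fd);
	return 0;
}
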
- * People changing diskettes in the middle of an operation deserve - * to lose :-) - */ -int check_disk_change(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - struct block_device_operations * bdops = disk->fops; - - if (!bdops->media_changed) - return 0; - if (!bdops->media_changed(bdev->bd_disk)) - return 0; - - flush_disk(bdev); - if (bdops->revalidate_disk) - bdops->revalidate_disk(bdev->bd_disk); - return 1; -} - -EXPORT_SYMBOL(check_disk_change); - -void bd_set_size(struct block_device *bdev, loff_t size) -{ - unsigned bsize = bdev_hardsect_size(bdev); - - bdev->bd_inode->i_size = size; - while (bsize < PAGE_CACHE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); -} -EXPORT_SYMBOL(bd_set_size); - -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); - -/* - * bd_mutex locking: - * - * mutex_lock(part->bd_mutex) - * mutex_lock_nested(whole->bd_mutex, 1) - */ - -static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) -{ - struct gendisk *disk; - int ret; - int partno; - int perm = 0; - - if (mode & FMODE_READ) - perm |= MAY_READ; - if (mode & FMODE_WRITE) - perm |= MAY_WRITE; - /* - * hooks: /n/, see "layering violations". - */ - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) { - bdput(bdev); - return ret; - } - - lock_kernel(); - restart: - - ret = -ENXIO; - disk = get_gendisk(bdev->bd_dev, &partno); - if (!disk) - goto out_unlock_kernel; - - mutex_lock_nested(&bdev->bd_mutex, for_part); - if (!bdev->bd_openers) { - bdev->bd_disk = disk; - bdev->bd_contains = bdev; - if (!partno) { - struct backing_dev_info *bdi; - - ret = -ENXIO; - bdev->bd_part = disk_get_part(disk, partno); - if (!bdev->bd_part) - goto out_clear; - - if (disk->fops->open) { - ret = disk->fops->open(bdev, mode); - if (ret == -ERESTARTSYS) { - /* Lost a race with 'disk' being - * deleted, try again. 
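
bd_set_size() a little further up picks the largest power-of-two block size, capped at the page size, that still divides the device size evenly, starting from the hardware sector size. The same loop in isolation (4096 and 512 stand in for PAGE_CACHE_SIZE and bdev_hardsect_size()):

#include <stdio.h>

#define PAGE_SIZE_BYTES	4096u	/* stand-in for PAGE_CACHE_SIZE */
#define HARDSECT_SIZE	512u	/* stand-in for bdev_hardsect_size() */

static unsigned pick_block_size(unsigned long long dev_size)
{
	unsigned bsize = HARDSECT_SIZE;

	/* Double the candidate while the device size is still a multiple
	 * of it and it has not yet reached the page size. */
	while (bsize < PAGE_SIZE_BYTES) {
		if (dev_size & bsize)
			break;
		bsize <<= 1;
	}
	return bsize;
}

int main(void)
{
	/* 1 GiB: every power of two up to the page size divides it. */
	printf("%u\n", pick_block_size(1ULL << 30));		/* 4096 */
	/* 1 GiB plus one odd sector: only 512 divides it. */
	printf("%u\n", pick_block_size((1ULL << 30) + 512));	/* 512 */
	return 0;
}
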
- * See md.c - */ - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; - module_put(disk->fops->owner); - put_disk(disk); - bdev->bd_disk = NULL; - mutex_unlock(&bdev->bd_mutex); - goto restart; - } - if (ret) - goto out_clear; - } - if (!bdev->bd_openers) { - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - bdi = &default_backing_dev_info; - bdev->bd_inode->i_data.backing_dev_info = bdi; - } - if (bdev->bd_invalidated) - rescan_partitions(disk, bdev); - } else { - struct block_device *whole; - whole = bdget_disk(disk, 0); - ret = -ENOMEM; - if (!whole) - goto out_clear; - BUG_ON(for_part); - ret = __blkdev_get(whole, mode, 1); - if (ret) - goto out_clear; - bdev->bd_contains = whole; - bdev->bd_inode->i_data.backing_dev_info = - whole->bd_inode->i_data.backing_dev_info; - bdev->bd_part = disk_get_part(disk, partno); - if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev->bd_part->nr_sects) { - ret = -ENXIO; - goto out_clear; - } - bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - } - } else { - put_disk(disk); - module_put(disk->fops->owner); - disk = NULL; - if (bdev->bd_contains == bdev) { - if (bdev->bd_disk->fops->open) { - ret = bdev->bd_disk->fops->open(bdev, mode); - if (ret) - goto out_unlock_bdev; - } - if (bdev->bd_invalidated) - rescan_partitions(bdev->bd_disk, bdev); - } - } - bdev->bd_openers++; - if (for_part) - bdev->bd_part_count++; - mutex_unlock(&bdev->bd_mutex); - unlock_kernel(); - return 0; - - out_clear: - disk_put_part(bdev->bd_part); - bdev->bd_disk = NULL; - bdev->bd_part = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; - if (bdev != bdev->bd_contains) - __blkdev_put(bdev->bd_contains, mode, 1); - bdev->bd_contains = NULL; - out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); - out_unlock_kernel: - unlock_kernel(); - - if (disk) - module_put(disk->fops->owner); - put_disk(disk); - bdput(bdev); - - return ret; -} - -int blkdev_get(struct block_device *bdev, fmode_t mode) -{ - return __blkdev_get(bdev, mode, 0); -} -EXPORT_SYMBOL(blkdev_get); - -static int blkdev_open(struct inode * inode, struct file * filp) -{ - struct block_device *bdev; - int res; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. 
- */ - filp->f_flags |= O_LARGEFILE; - - if (filp->f_flags & O_NDELAY) - filp->f_mode |= FMODE_NDELAY; - if (filp->f_flags & O_EXCL) - filp->f_mode |= FMODE_EXCL; - if ((filp->f_flags & O_ACCMODE) == 3) - filp->f_mode |= FMODE_WRITE_IOCTL; - - bdev = bd_acquire(inode); - if (bdev == NULL) - return -ENOMEM; - - filp->f_mapping = bdev->bd_inode->i_mapping; - - res = blkdev_get(bdev, filp->f_mode); - if (res) - return res; - - if (filp->f_mode & FMODE_EXCL) { - res = bd_claim(bdev, filp); - if (res) - goto out_blkdev_put; - } - - return 0; - - out_blkdev_put: - blkdev_put(bdev, filp->f_mode); - return res; -} - -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) -{ - int ret = 0; - struct gendisk *disk = bdev->bd_disk; - struct block_device *victim = NULL; - - mutex_lock_nested(&bdev->bd_mutex, for_part); - lock_kernel(); - if (for_part) - bdev->bd_part_count--; - - if (!--bdev->bd_openers) { - sync_blockdev(bdev); - kill_bdev(bdev); - } - if (bdev->bd_contains == bdev) { - if (disk->fops->release) - ret = disk->fops->release(disk, mode); - } - if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; - bdev->bd_disk = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; - if (bdev != bdev->bd_contains) - victim = bdev->bd_contains; - bdev->bd_contains = NULL; - } - unlock_kernel(); - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - if (victim) - __blkdev_put(victim, mode, 1); - return ret; -} - -int blkdev_put(struct block_device *bdev, fmode_t mode) -{ - return __blkdev_put(bdev, mode, 0); -} -EXPORT_SYMBOL(blkdev_put); - -static int blkdev_close(struct inode * inode, struct file * filp) -{ - struct block_device *bdev = I_BDEV(filp->f_mapping->host); - if (bdev->bd_holder == filp) - bd_release(bdev); - return blkdev_put(bdev, filp->f_mode); -} - -static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct block_device *bdev = I_BDEV(file->f_mapping->host); - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - return blkdev_ioctl(bdev, mode, cmd, arg); -} - -/* - * Try to release a page associated with block device when the system - * is under memory pressure. 
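
A few lines up, block_ioctl() recomputes FMODE_NDELAY on every call because O_NDELAY can be flipped on an already-open descriptor at any time. A small userspace illustration of exactly that, toggling O_NONBLOCK (the same bit as O_NDELAY on Linux) with fcntl():

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = STDIN_FILENO;	/* any open descriptor works for the demo */
	int flags = fcntl(fd, F_GETFL);

	if (flags < 0) {
		perror("F_GETFL");
		return 1;
	}
	printf("O_NONBLOCK initially %s\n",
	       (flags & O_NONBLOCK) ? "set" : "clear");

	/* Flip the flag on the live descriptor; this is the change the
	 * comment above says must be picked up before every ioctl. */
	if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		perror("F_SETFL");
		return 1;
	}
	flags = fcntl(fd, F_GETFL);
	printf("O_NONBLOCK now %s\n",
	       (flags & O_NONBLOCK) ? "set" : "clear");

	fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);	/* restore */
	return 0;
}
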
- */ -static int blkdev_releasepage(struct page *page, gfp_t wait) -{ - struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; - - if (super && super->s_op->bdev_try_to_free_page) - return super->s_op->bdev_try_to_free_page(super, page, wait); - - return try_to_free_buffers(page); -} - -static const struct address_space_operations def_blk_aops = { - .readpage = blkdev_readpage, - .writepage = blkdev_writepage, - .sync_page = block_sync_page, - .write_begin = blkdev_write_begin, - .write_end = blkdev_write_end, - .writepages = generic_writepages, - .releasepage = blkdev_releasepage, - .direct_IO = blkdev_direct_IO, -}; - -const struct file_operations def_blk_fops = { - .open = blkdev_open, - .release = blkdev_close, -#ifndef DDE_LINUX - .llseek = block_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write_nolock, - .mmap = generic_file_mmap, - .fsync = block_fsync, - .unlocked_ioctl = block_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = compat_blkdev_ioctl, -#endif - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, -#endif /* DDE_LINUX */ -}; - -int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -{ - int res; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - res = blkdev_ioctl(bdev, 0, cmd, arg); - set_fs(old_fs); - return res; -} - -EXPORT_SYMBOL(ioctl_by_bdev); - -/** - * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device - * - * Get a reference to the blockdevice at @pathname in the current - * namespace if possible and return it. Return ERR_PTR(error) - * otherwise. - */ -struct block_device *lookup_bdev(const char *pathname) -{ - struct block_device *bdev; - struct inode *inode; - struct path path; - int error; - - if (!pathname || !*pathname) - return ERR_PTR(-EINVAL); - - error = kern_path(pathname, LOOKUP_FOLLOW, &path); - if (error) - return ERR_PTR(error); - - inode = path.dentry->d_inode; - error = -ENOTBLK; - if (!S_ISBLK(inode->i_mode)) - goto fail; - error = -EACCES; - if (path.mnt->mnt_flags & MNT_NODEV) - goto fail; - error = -ENOMEM; - bdev = bd_acquire(inode); - if (!bdev) - goto fail; -out: - path_put(&path); - return bdev; -fail: - bdev = ERR_PTR(error); - goto out; -} -EXPORT_SYMBOL(lookup_bdev); - -/** - * open_bdev_exclusive - open a block device by name and set it up for use - * - * @path: special file representing the block device - * @mode: FMODE_... combination to pass be used - * @holder: owner for exclusion - * - * Open the blockdevice described by the special file at @path, claim it - * for the @holder. - */ -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) -{ - struct block_device *bdev; - int error = 0; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; - - error = blkdev_get(bdev, mode); - if (error) - return ERR_PTR(error); - error = -EACCES; - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto blkdev_put; - error = bd_claim(bdev, holder); - if (error) - goto blkdev_put; - - return bdev; - -blkdev_put: - blkdev_put(bdev, mode); - return ERR_PTR(error); -} - -EXPORT_SYMBOL(open_bdev_exclusive); - -/** - * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() - * - * @bdev: blockdevice to close - * @mode: mode, must match that used to open. - * - * This is the counterpart to open_bdev_exclusive(). 
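
lookup_bdev() resolves a path to a block device and open_bdev_exclusive() then opens and claims it. A rough userspace counterpart, assuming argv[1] names a block special file; O_EXCL on a block device requests the same kind of exclusive claim that blkdev_open() implements above with bd_claim():

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(int argc, char **argv)
{
	struct stat st;
	int fd;

	if (argc < 2)
		return 1;

	/* lookup_bdev(): the path must resolve to a block special file. */
	if (stat(argv[1], &st) != 0) {
		perror("stat");
		return 1;
	}
	if (!S_ISBLK(st.st_mode)) {
		fprintf(stderr, "%s: not a block device\n", argv[1]);
		return 1;	/* the kernel version returns -ENOTBLK */
	}
	printf("%s is device %u:%u\n", argv[1],
	       major(st.st_rdev), minor(st.st_rdev));

	/* open_bdev_exclusive(): open it and claim it; EBUSY here means
	 * somebody else (a mount, another claimant) already holds it. */
	fd = open(argv[1], O_RDONLY | O_EXCL);
	if (fd < 0) {
		perror("exclusive open");
		return 1;
	}
	close(fd);
	return 0;
}
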
- */ -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) -{ - bd_release(bdev); - blkdev_put(bdev, mode); -} - -EXPORT_SYMBOL(close_bdev_exclusive); - -int __invalidate_device(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - int res = 0; - - if (sb) { - /* - * no need to lock the super, get_super holds the - * read mutex so the filesystem cannot go away - * under us (->put_super runs with the write lock - * hold). - */ - shrink_dcache_sb(sb); - res = invalidate_inodes(sb); - drop_super(sb); - } - invalidate_bdev(bdev); - return res; -} -EXPORT_SYMBOL(__invalidate_device); diff --git a/libdde_linux26/lib/src/fs/.svn/text-base/buffer.c.svn-base b/libdde_linux26/lib/src/fs/.svn/text-base/buffer.c.svn-base deleted file mode 100644 index d3b1c445..00000000 --- a/libdde_linux26/lib/src/fs/.svn/text-base/buffer.c.svn-base +++ /dev/null @@ -1,3474 +0,0 @@ -/* - * linux/fs/buffer.c - * - * Copyright (C) 1991, 1992, 2002 Linus Torvalds - */ - -/* - * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 - * - * Removed a lot of unnecessary code and simplified things now that - * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 - * - * Speed up hash, lru, and free list operations. Use gfp() for allocating - * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM - * - * Added 32k buffer block sizes - these are required older ARM systems. - RMK - * - * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> - */ - -#include <linux/kernel.h> -#include <linux/syscalls.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/blkdev.h> -#include <linux/file.h> -#include <linux/quotaops.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/writeback.h> -#include <linux/hash.h> -#include <linux/suspend.h> -#include <linux/buffer_head.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/bio.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/bitops.h> -#include <linux/mpage.h> -#include <linux/bit_spinlock.h> - -static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); - -#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) - -inline void -init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) -{ - bh->b_end_io = handler; - bh->b_private = private; -} - -static int sync_buffer(void *word) -{ - struct block_device *bd; - struct buffer_head *bh - = container_of(word, struct buffer_head, b_state); - - smp_mb(); - bd = bh->b_bdev; - if (bd) - blk_run_address_space(bd->bd_inode->i_mapping); - io_schedule(); - return 0; -} - -void __lock_buffer(struct buffer_head *bh) -{ - wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, - TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(__lock_buffer); - -void unlock_buffer(struct buffer_head *bh) -{ - clear_bit_unlock(BH_Lock, &bh->b_state); - smp_mb__after_clear_bit(); - wake_up_bit(&bh->b_state, BH_Lock); -} - -/* - * Block until a buffer comes unlocked. This doesn't stop it - * from becoming locked again - you have to lock it yourself - * if you want to preserve its state. 
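
__lock_buffer()/unlock_buffer() above treat BH_Lock as a bit lock: set it with acquire semantics, clear it with release semantics, and sleep/wake around it. A minimal C11 sketch of the same idea that simply spins where the kernel would sleep on wait_on_bit_lock():

#include <stdio.h>
#include <stdatomic.h>

#define BH_LOCK_BIT	(1u << 0)	/* stand-in for the BH_Lock flag */

/* Spin until the fetch_or tells us the bit was previously clear,
 * i.e. we are the thread that set it (acquire ordering). */
static void lock_bh(atomic_uint *state)
{
	while (atomic_fetch_or_explicit(state, BH_LOCK_BIT,
					memory_order_acquire) & BH_LOCK_BIT)
		;	/* the kernel sleeps here instead of spinning */
}

/* Clear the bit with release ordering, like clear_bit_unlock();
 * wake_up_bit() would wake any sleepers at this point. */
static void unlock_bh(atomic_uint *state)
{
	atomic_fetch_and_explicit(state, ~BH_LOCK_BIT, memory_order_release);
}

int main(void)
{
	atomic_uint state = 0;

	lock_bh(&state);
	printf("locked, state=0x%x\n", atomic_load(&state));
	unlock_bh(&state);
	printf("unlocked, state=0x%x\n", atomic_load(&state));
	return 0;
}
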
- */ -void __wait_on_buffer(struct buffer_head * bh) -{ - wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); -} - -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); -} - - -static int quiet_error(struct buffer_head *bh) -{ - if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) - return 0; - return 1; -} - - -static void buffer_io_error(struct buffer_head *bh) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); -} - -/* - * End-of-IO handler helper function which does not touch the bh after - * unlocking it. - * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but - * a race there is benign: unlock_buffer() only use the bh's address for - * hashing after unlocking the buffer, so it doesn't actually touch the bh - * itself. - */ -static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) -{ - if (uptodate) { - set_buffer_uptodate(bh); - } else { - /* This happens, due to failed READA attempts. */ - clear_buffer_uptodate(bh); - } - unlock_buffer(bh); -} - -/* - * Default synchronous end-of-IO handler.. Just mark it up-to-date and - * unlock the buffer. This is what ll_rw_block uses too. - */ -void end_buffer_read_sync(struct buffer_head *bh, int uptodate) -{ - __end_buffer_read_notouch(bh, uptodate); - put_bh(bh); -} - -void end_buffer_write_sync(struct buffer_head *bh, int uptodate) -{ - char b[BDEVNAME_SIZE]; - - if (uptodate) { - set_buffer_uptodate(bh); - } else { - if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } - set_buffer_write_io_error(bh); - clear_buffer_uptodate(bh); - } - unlock_buffer(bh); - put_bh(bh); -} - -/* - * Write out and wait upon all the dirty data associated with a block - * device via its mapping. Does not take the superblock lock. - */ -int sync_blockdev(struct block_device *bdev) -{ -#ifndef DDE_LINUX - int ret = 0; - - if (bdev) - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); - return ret; -#else - WARN_UNIMPL; - return 0; -#endif /* DDE_LINUX */ -} -EXPORT_SYMBOL(sync_blockdev); - -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ -#ifndef DDE_LINUX - struct super_block *sb = get_super(bdev); - if (sb) { - int res = fsync_super(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -#else - WARN_UNIMPL; - return -1; -#endif -} - -/** - * freeze_bdev -- lock a filesystem and force it into a consistent state - * @bdev: blockdevice to lock - * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. - * If a superblock is found on this device, we take the s_umount semaphore - * on it to make sure nobody unmounts until the snapshot creation is done. - * The reference counter (bd_fsfreeze_count) guarantees that only the last - * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze - * actually. 
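
On kernels that provide them, the FIFREEZE/FITHAW ioctls are the userspace doorway to this freeze/thaw machinery. A hedged sketch (argv[1] is assumed to be the mount point of the filesystem to freeze, and CAP_SYS_ADMIN is required):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* mount point of the filesystem */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Freeze: writers block and the on-disk image is consistent,
	 * which is what freeze_bdev() arranges for snapshot tools. */
	if (ioctl(fd, FIFREEZE, 0) != 0) {
		perror("FIFREEZE");
		close(fd);
		return 1;
	}
	puts("frozen - take the snapshot now");

	/* Thaw again, the counterpart of thaw_bdev(). */
	if (ioctl(fd, FITHAW, 0) != 0)
		perror("FITHAW");
	close(fd);
	return 0;
}
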
- */ -struct super_block *freeze_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; - sb = get_super(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - bdev->bd_fsfreeze_count++; - - down(&bdev->bd_mount_sem); - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - __fsync_super(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - up(&bdev->bd_mount_sem); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } - } - } - - sync_blockdev(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ -} -EXPORT_SYMBOL(freeze_bdev); - -/** - * thaw_bdev -- unlock filesystem - * @bdev: blockdevice to unlock - * @sb: associated superblock - * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). - */ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); - } - drop_super(sb); - } - - up(&bdev->bd_mount_sem); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; -} -EXPORT_SYMBOL(thaw_bdev); - -/* - * Various filesystems appear to want __find_get_block to be non-blocking. - * But it's the page lock which protects the buffers. To get around this, - * we get exclusion from try_to_free_buffers with the blockdev mapping's - * private_lock. - * - * Hack idea: for the blockdev mapping, i_bufferlist_lock contention - * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->tree_lock). 
- */ -static struct buffer_head * -__find_get_block_slow(struct block_device *bdev, sector_t block) -{ - struct inode *bd_inode = bdev->bd_inode; - struct address_space *bd_mapping = bd_inode->i_mapping; - struct buffer_head *ret = NULL; - pgoff_t index; - struct buffer_head *bh; - struct buffer_head *head; - struct page *page; - int all_mapped = 1; - - index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); - page = find_get_page(bd_mapping, index); - if (!page) - goto out; - - spin_lock(&bd_mapping->private_lock); - if (!page_has_buffers(page)) - goto out_unlock; - head = page_buffers(page); - bh = head; - do { - if (bh->b_blocknr == block) { - ret = bh; - get_bh(bh); - goto out_unlock; - } - if (!buffer_mapped(bh)) - all_mapped = 0; - bh = bh->b_this_page; - } while (bh != head); - - /* we might be here because some of the buffers on this page are - * not mapped. This is due to various races between - * file io on the block device and getblk. It gets dealt with - * elsewhere, don't buffer_error if we had some unmapped buffers - */ - if (all_mapped) { - printk("__find_get_block_slow() failed. " - "block=%llu, b_blocknr=%llu\n", - (unsigned long long)block, - (unsigned long long)bh->b_blocknr); - printk("b_state=0x%08lx, b_size=%zu\n", - bh->b_state, bh->b_size); - printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); - } -out_unlock: - spin_unlock(&bd_mapping->private_lock); - page_cache_release(page); -out: - return ret; -} - -/* If invalidate_buffers() will trash dirty buffers, it means some kind - of fs corruption is going on. Trashing dirty data always imply losing - information that was supposed to be just stored on the physical layer - by the user. - - Thus invalidate_buffers in general usage is not allwowed to trash - dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to - be preserved. These buffers are simply skipped. - - We also skip buffers which are still in use. For example this can - happen if a userspace program is reading the block device. - - NOTE: In the case where the user removed a removable-media-disk even if - there's still dirty data not synced on disk (due a bug in the device driver - or due an error of the user), by not destroying the dirty buffers we could - generate corruption also on the next media inserted, thus a parameter is - necessary to handle this case in the most safe way possible (trying - to not corrupt also the new disk inserted with the data belonging to - the old now corrupted disk). Also for the ramdisk the natural thing - to do in order to release the ramdisk memory is to destroy dirty buffers. - - These are two special cases. Normal usage imply the device driver - to issue a sync on the device (without waiting I/O completion) and - then an invalidate_buffers call that doesn't trash dirty buffers. - - For handling cache coherency with the blkdev pagecache the 'update' case - is been introduced. It is needed to re-read from disk any pinned - buffer. NOTE: re-reading from disk is destructive so we can do it only - when we assume nobody is changing the buffercache under our I/O and when - we think the disk contains more recent information than the buffercache. - The update == 1 pass marks the buffers we need to update, the update == 2 - pass does the actual I/O. 
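
The ioctl mentioned in the comment above (BLKFLSBUF in current headers) is the usual way to reach this flush-and-invalidate behaviour from userspace. A hedged sketch, assuming argv[1] is a block device node and the caller has the required privileges:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKFLSBUF */

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Write back dirty buffers and drop the clean cache for this
	 * device, ending up in code much like invalidate_bdev() below. */
	if (ioctl(fd, BLKFLSBUF, 0) != 0)
		perror("BLKFLSBUF");
	close(fd);
	return 0;
}
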
*/ -void invalidate_bdev(struct block_device *bdev) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - if (mapping->nrpages == 0) - return; - -#ifndef DDE_LINUX - invalidate_bh_lrus(); - invalidate_mapping_pages(mapping, 0, -1); -#endif -} - -/* - * Kick pdflush then try to free up some ZONE_NORMAL memory. - */ -static void free_more_memory(void) -{ - struct zone *zone; - int nid; - -#ifndef DDE_LINUX - wakeup_pdflush(1024); - yield(); - - for_each_online_node(nid) { - (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL, - &zone); - if (zone) - try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS); - } -#else - WARN_UNIMPL; -#endif -} - -/* - * I/O completion handler for block_read_full_page() - pages - * which come unlocked at the end of I/O. - */ -static void end_buffer_async_read(struct buffer_head *bh, int uptodate) -{ - unsigned long flags; - struct buffer_head *first; - struct buffer_head *tmp; - struct page *page; - int page_uptodate = 1; - - BUG_ON(!buffer_async_read(bh)); - - page = bh->b_page; - if (uptodate) { - set_buffer_uptodate(bh); - } else { - clear_buffer_uptodate(bh); - if (!quiet_error(bh)) - buffer_io_error(bh); - SetPageError(page); - } - - /* - * Be _very_ careful from here on. Bad things can happen if - * two buffer heads end IO at almost the same time and both - * decide that the page is now completely done. - */ - first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); - clear_buffer_async_read(bh); - unlock_buffer(bh); - tmp = bh; - do { - if (!buffer_uptodate(tmp)) - page_uptodate = 0; - if (buffer_async_read(tmp)) { - BUG_ON(!buffer_locked(tmp)); - goto still_busy; - } - tmp = tmp->b_this_page; - } while (tmp != bh); - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - - /* - * If none of the buffers had errors and they are all - * uptodate then we can set the page uptodate. - */ - if (page_uptodate && !PageError(page)) - SetPageUptodate(page); - unlock_page(page); - return; - -still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; -} - -/* - * Completion handler for block_write_full_page() - pages which are unlocked - * during I/O, and which have PageWriteback cleared upon I/O completion. 
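
end_buffer_async_read() above walks every buffer on the page through the b_this_page ring before deciding whether the whole page can be marked up to date. The ring walk on its own, with a toy structure in place of struct buffer_head:

#include <stdio.h>
#include <stdbool.h>

/* Cut-down buffer_head: just the ring pointer and one state flag. */
struct buf {
	struct buf *next_on_page;	/* plays the role of b_this_page */
	bool uptodate;
};

/* Walk the whole ring starting from any member, as
 * end_buffer_async_read() does, and report whether every buffer on
 * the page is now up to date. */
static bool page_uptodate(struct buf *bh)
{
	struct buf *tmp = bh;

	do {
		if (!tmp->uptodate)
			return false;
		tmp = tmp->next_on_page;
	} while (tmp != bh);
	return true;
}

int main(void)
{
	struct buf b[3];	/* three buffers covering one page */

	for (int i = 0; i < 3; i++) {
		b[i].next_on_page = &b[(i + 1) % 3];
		b[i].uptodate = true;
	}
	printf("%d\n", page_uptodate(&b[1]));	/* 1 */
	b[2].uptodate = false;
	printf("%d\n", page_uptodate(&b[1]));	/* 0 */
	return 0;
}
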
- */ -static void end_buffer_async_write(struct buffer_head *bh, int uptodate) -{ - char b[BDEVNAME_SIZE]; - unsigned long flags; - struct buffer_head *first; - struct buffer_head *tmp; - struct page *page; - - BUG_ON(!buffer_async_write(bh)); - - page = bh->b_page; - if (uptodate) { - set_buffer_uptodate(bh); - } else { - if (!quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } - set_bit(AS_EIO, &page->mapping->flags); - set_buffer_write_io_error(bh); - clear_buffer_uptodate(bh); - SetPageError(page); - } - - first = page_buffers(page); - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); - - clear_buffer_async_write(bh); - unlock_buffer(bh); - tmp = bh->b_this_page; - while (tmp != bh) { - if (buffer_async_write(tmp)) { - BUG_ON(!buffer_locked(tmp)); - goto still_busy; - } - tmp = tmp->b_this_page; - } - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - end_page_writeback(page); - return; - -still_busy: - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); - local_irq_restore(flags); - return; -} - -/* - * If a page's buffers are under async readin (end_buffer_async_read - * completion) then there is a possibility that another thread of - * control could lock one of the buffers after it has completed - * but while some of the other buffers have not completed. This - * locked buffer would confuse end_buffer_async_read() into not unlocking - * the page. So the absence of BH_Async_Read tells end_buffer_async_read() - * that this buffer is not under async I/O. - * - * The page comes unlocked when it has no locked buffer_async buffers - * left. - * - * PageLocked prevents anyone starting new async I/O reads any of - * the buffers. - * - * PageWriteback is used to prevent simultaneous writeout of the same - * page. - * - * PageLocked prevents anyone from starting writeback of a page which is - * under read I/O (PageWriteback is only ever set against a locked page). - */ -static void mark_buffer_async_read(struct buffer_head *bh) -{ - bh->b_end_io = end_buffer_async_read; - set_buffer_async_read(bh); -} - -void mark_buffer_async_write(struct buffer_head *bh) -{ - bh->b_end_io = end_buffer_async_write; - set_buffer_async_write(bh); -} -EXPORT_SYMBOL(mark_buffer_async_write); - - -/* - * fs/buffer.c contains helper functions for buffer-backed address space's - * fsync functions. A common requirement for buffer-based filesystems is - * that certain data from the backing blockdev needs to be written out for - * a successful fsync(). For example, ext2 indirect blocks need to be - * written back and waited upon before fsync() returns. - * - * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), - * inode_has_buffers() and invalidate_inode_buffers() are provided for the - * management of a list of dependent buffers at ->i_mapping->private_list. - * - * Locking is a little subtle: try_to_free_buffers() will remove buffers - * from their controlling inode's queue when they are being freed. But - * try_to_free_buffers() will be operating against the *blockdev* mapping - * at the time, not against the S_ISREG file which depends on those buffers. - * So the locking for private_list is via the private_lock in the address_space - * which backs the buffers. Which is different from the address_space - * against which the buffers are listed. So for a particular address_space, - * mapping->private_lock does *not* protect mapping->private_list! 
In fact, - * mapping->private_list will always be protected by the backing blockdev's - * ->private_lock. - * - * Which introduces a requirement: all buffers on an address_space's - * ->private_list must be from the same address_space: the blockdev's. - * - * address_spaces which do not place buffers at ->private_list via these - * utility functions are free to use private_lock and private_list for - * whatever they want. The only requirement is that list_empty(private_list) - * be true at clear_inode() time. - * - * FIXME: clear_inode should not call invalidate_inode_buffers(). The - * filesystems should do that. invalidate_inode_buffers() should just go - * BUG_ON(!list_empty). - * - * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should - * take an address_space, not an inode. And it should be called - * mark_buffer_dirty_fsync() to clearly define why those buffers are being - * queued up. - * - * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the - * list if it is already on a list. Because if the buffer is on a list, - * it *must* already be on the right one. If not, the filesystem is being - * silly. This will save a ton of locking. But first we have to ensure - * that buffers are taken *off* the old inode's list when they are freed - * (presumably in truncate). That requires careful auditing of all - * filesystems (do it inside bforget()). It could also be done by bringing - * b_inode back. - */ - -/* - * The buffer's backing address_space's private_lock must be held - */ -static void __remove_assoc_queue(struct buffer_head *bh) -{ - list_del_init(&bh->b_assoc_buffers); - WARN_ON(!bh->b_assoc_map); - if (buffer_write_io_error(bh)) - set_bit(AS_EIO, &bh->b_assoc_map->flags); - bh->b_assoc_map = NULL; -} - -int inode_has_buffers(struct inode *inode) -{ - return !list_empty(&inode->i_data.private_list); -} - -/* - * osync is designed to support O_SYNC io. It waits synchronously for - * all already-submitted IO to complete, but does not queue any new - * writes to the disk. - * - * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as - * you dirty the buffers, and then use osync_inode_buffers to wait for - * completion. Any other dirty buffers which are not yet queued for - * write will not be flushed to disk by the osync. - */ -static int osync_buffers_list(spinlock_t *lock, struct list_head *list) -{ - struct buffer_head *bh; - struct list_head *p; - int err = 0; - - spin_lock(lock); -repeat: - list_for_each_prev(p, list) { - bh = BH_ENTRY(p); - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(lock); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - err = -EIO; - brelse(bh); - spin_lock(lock); - goto repeat; - } - } - spin_unlock(lock); - return err; -} - -/** - * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers - * @mapping: the mapping which wants those buffers written - * - * Starts I/O against the buffers at mapping->private_list, and waits upon - * that I/O. - * - * Basically, this is a convenience function for fsync(). - * @mapping is a file or directory which needs those buffers to be written for - * a successful fsync(). 
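
The BH_ENTRY() used by the list walks above is the kernel's usual embedded-list idiom: a list_head lives inside the buffer_head and container_of() recovers the enclosing structure from it. A stripped-down version of that machinery:

#include <stdio.h>
#include <stddef.h>

/* Minimal versions of the list_head / container_of machinery that
 * BH_ENTRY() is built from. */
struct list_head {
	struct list_head *next, *prev;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct buf {
	int blocknr;
	struct list_head assoc;		/* plays the role of b_assoc_buffers */
};

#define BUF_ENTRY(list)	container_of(list, struct buf, assoc)

int main(void)
{
	struct buf b = { .blocknr = 42 };
	struct list_head *p = &b.assoc;	/* what a list walk hands us */

	/* Recover the enclosing buffer from its embedded list node. */
	printf("block %d\n", BUF_ENTRY(p)->blocknr);
	return 0;
}
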
- */ -int sync_mapping_buffers(struct address_space *mapping) -{ - struct address_space *buffer_mapping = mapping->assoc_mapping; - - if (buffer_mapping == NULL || list_empty(&mapping->private_list)) - return 0; - - return fsync_buffers_list(&buffer_mapping->private_lock, - &mapping->private_list); -} -EXPORT_SYMBOL(sync_mapping_buffers); - -/* - * Called when we've recently written block `bblock', and it is known that - * `bblock' was for a buffer_boundary() buffer. This means that the block at - * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's - * dirty, schedule it for IO. So that indirects merge nicely with their data. - */ -void write_boundary_block(struct block_device *bdev, - sector_t bblock, unsigned blocksize) -{ - struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); - if (bh) { - if (buffer_dirty(bh)) - ll_rw_block(WRITE, 1, &bh); - put_bh(bh); - } -} - -void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) -{ - struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; - - mark_buffer_dirty(bh); - if (!mapping->assoc_mapping) { - mapping->assoc_mapping = buffer_mapping; - } else { - BUG_ON(mapping->assoc_mapping != buffer_mapping); - } - if (!bh->b_assoc_map) { - spin_lock(&buffer_mapping->private_lock); - list_move_tail(&bh->b_assoc_buffers, - &mapping->private_list); - bh->b_assoc_map = mapping; - spin_unlock(&buffer_mapping->private_lock); - } -} -EXPORT_SYMBOL(mark_buffer_dirty_inode); - -/* - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode - * dirty. - * - * If warn is true, then emit a warning if the page is not uptodate and has - * not been truncated. - */ -static void __set_page_dirty(struct page *page, - struct address_space *mapping, int warn) -{ - spin_lock_irq(&mapping->tree_lock); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(warn && !PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irq(&mapping->tree_lock); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); -} - -/* - * Add a page to the dirty page list. - * - * It is a sad fact of life that this function is called from several places - * deeply under spinlocking. It may not sleep. - * - * If the page has buffers, the uptodate buffers are set dirty, to preserve - * dirty-state coherency between the page and the buffers. It the page does - * not have buffers then when they are later attached they will all be set - * dirty. - * - * The buffers are dirtied before the page is dirtied. There's a small race - * window in which a writepage caller may see the page cleanness but not the - * buffer dirtiness. That's fine. If this code were to set the page dirty - * before the buffers, a concurrent writepage caller could clear the page dirty - * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean - * page on the dirty page list. - * - * We use private_lock to lock against try_to_free_buffers while using the - * page's buffer list. Also use this to protect against clean buffers being - * added to the page after it was set dirty. - * - * FIXME: may need to call ->reservepage here as well. That's rather up to the - * address_space though. 
- */ -int __set_page_dirty_buffers(struct page *page) -{ - int newly_dirty; - struct address_space *mapping = page_mapping(page); - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - spin_lock(&mapping->private_lock); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh = head; - - do { - set_buffer_dirty(bh); - bh = bh->b_this_page; - } while (bh != head); - } - newly_dirty = !TestSetPageDirty(page); - spin_unlock(&mapping->private_lock); - - if (newly_dirty) - __set_page_dirty(page, mapping, 1); - return newly_dirty; -} -EXPORT_SYMBOL(__set_page_dirty_buffers); - -/* - * Write out and wait upon a list of buffers. - * - * We have conflicting pressures: we want to make sure that all - * initially dirty buffers get waited on, but that any subsequently - * dirtied buffers don't. After all, we don't want fsync to last - * forever if somebody is actively writing to the file. - * - * Do this in two main stages: first we copy dirty buffers to a - * temporary inode list, queueing the writes as we go. Then we clean - * up, waiting for those writes to complete. - * - * During this second stage, any subsequent updates to the file may end - * up refiling the buffer on the original inode's dirty list again, so - * there is a chance we will end up with a buffer queued for write but - * not yet completed on that list. So, as a final cleanup we go through - * the osync code to catch these locked, dirty buffers without requeuing - * any newly dirty buffers for write. - */ -static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) -{ - struct buffer_head *bh; - struct list_head tmp; - struct address_space *mapping; - int err = 0, err2; - - INIT_LIST_HEAD(&tmp); - - spin_lock(lock); - while (!list_empty(list)) { - bh = BH_ENTRY(list->next); - mapping = bh->b_assoc_map; - __remove_assoc_queue(bh); - /* Avoid race with mark_buffer_dirty_inode() which does - * a lockless check and we rely on seeing the dirty bit */ - smp_mb(); - if (buffer_dirty(bh) || buffer_locked(bh)) { - list_add(&bh->b_assoc_buffers, &tmp); - bh->b_assoc_map = mapping; - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(lock); - /* - * Ensure any pending I/O completes so that - * ll_rw_block() actually writes the current - * contents - it is a noop if I/O is still in - * flight on potentially older contents. - */ - ll_rw_block(SWRITE_SYNC, 1, &bh); - brelse(bh); - spin_lock(lock); - } - } - } - - while (!list_empty(&tmp)) { - bh = BH_ENTRY(tmp.prev); - get_bh(bh); - mapping = bh->b_assoc_map; - __remove_assoc_queue(bh); - /* Avoid race with mark_buffer_dirty_inode() which does - * a lockless check and we rely on seeing the dirty bit */ - smp_mb(); - if (buffer_dirty(bh)) { - list_add(&bh->b_assoc_buffers, - &mapping->private_list); - bh->b_assoc_map = mapping; - } - spin_unlock(lock); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - err = -EIO; - brelse(bh); - spin_lock(lock); - } - - spin_unlock(lock); - err2 = osync_buffers_list(lock, list); - if (err) - return err; - else - return err2; -} - -/* - * Invalidate any and all dirty buffers on a given inode. We are - * probably unmounting the fs, but that doesn't mean we have already - * done a sync(). Just drop the buffers from the inode list. - * - * NOTE: we take the inode's blockdev's mapping's private_lock. Which - * assumes that all the buffers are against the blockdev. Not true - * for reiserfs. 
- */ -void invalidate_inode_buffers(struct inode *inode) -{ - if (inode_has_buffers(inode)) { - struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->assoc_mapping; - - spin_lock(&buffer_mapping->private_lock); - while (!list_empty(list)) - __remove_assoc_queue(BH_ENTRY(list->next)); - spin_unlock(&buffer_mapping->private_lock); - } -} -EXPORT_SYMBOL(invalidate_inode_buffers); - -/* - * Remove any clean buffers from the inode's buffer list. This is called - * when we're trying to free the inode itself. Those buffers can pin it. - * - * Returns true if all buffers were removed. - */ -int remove_inode_buffers(struct inode *inode) -{ - int ret = 1; - - if (inode_has_buffers(inode)) { - struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->assoc_mapping; - - spin_lock(&buffer_mapping->private_lock); - while (!list_empty(list)) { - struct buffer_head *bh = BH_ENTRY(list->next); - if (buffer_dirty(bh)) { - ret = 0; - break; - } - __remove_assoc_queue(bh); - } - spin_unlock(&buffer_mapping->private_lock); - } - return ret; -} - -/* - * Create the appropriate buffers when given a page for data area and - * the size of each buffer.. Use the bh->b_this_page linked list to - * follow the buffers created. Return NULL if unable to create more - * buffers. - * - * The retry flag is used to differentiate async IO (paging, swapping) - * which may not fail from ordinary buffer allocations. - */ -struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry) -{ - struct buffer_head *bh, *head; - long offset; - -try_again: - head = NULL; - offset = PAGE_SIZE; - while ((offset -= size) >= 0) { - bh = alloc_buffer_head(GFP_NOFS); - if (!bh) - goto no_grow; - - bh->b_bdev = NULL; - bh->b_this_page = head; - bh->b_blocknr = -1; - head = bh; - - bh->b_state = 0; - atomic_set(&bh->b_count, 0); - bh->b_private = NULL; - bh->b_size = size; - - /* Link the buffer to its page */ - set_bh_page(bh, page, offset); - - init_buffer(bh, NULL, NULL); - } - return head; -/* - * In case anything failed, we just free everything we got. - */ -no_grow: - if (head) { - do { - bh = head; - head = head->b_this_page; - free_buffer_head(bh); - } while (head); - } - - /* - * Return failure for non-async IO requests. Async IO requests - * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with - * partially complete buffers, so all were released above. - */ - if (!retry) - return NULL; - - /* We're _really_ low on memory. Now we just - * wait for old buffer heads to become free due to - * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are - * async buffer heads in use. - */ - free_more_memory(); - goto try_again; -} -EXPORT_SYMBOL_GPL(alloc_page_buffers); - -static inline void -link_dev_buffers(struct page *page, struct buffer_head *head) -{ - struct buffer_head *bh, *tail; - - bh = head; - do { - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - attach_page_buffers(page, head); -} - -/* - * Initialise the state of a blockdev page's buffers. 
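
alloc_page_buffers() above builds the chain back to front, one buffer per block-sized slice of the page, and link_dev_buffers() then closes it into the b_this_page ring. The same construction with ordinary heap allocations (4096 stands in for PAGE_SIZE; freeing is omitted for brevity):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE_BYTES	4096L	/* stand-in for PAGE_SIZE */

struct buf {
	struct buf *next_on_page;	/* plays the role of b_this_page */
	long offset;			/* offset of this buffer in the page */
};

int main(void)
{
	long size = 1024;		/* block size; must divide the page */
	struct buf *head = NULL, *bh, *tail;
	long offset = PAGE_SIZE_BYTES;

	/* alloc_page_buffers(): build the chain back to front, so the
	 * final head is the buffer covering offset 0. */
	while ((offset -= size) >= 0) {
		bh = calloc(1, sizeof(*bh));
		if (!bh)
			return 1;
		bh->next_on_page = head;
		bh->offset = offset;
		head = bh;
	}

	/* link_dev_buffers(): close the chain into a ring. */
	tail = head;
	while (tail->next_on_page)
		tail = tail->next_on_page;
	tail->next_on_page = head;

	/* Walk the ring exactly once, the way the I/O completion
	 * handlers earlier in this file do. */
	bh = head;
	do {
		printf("buffer at offset %ld\n", bh->offset);
		bh = bh->next_on_page;
	} while (bh != head);
	return 0;
}
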
- */ -static void -init_page_buffers(struct page *page, struct block_device *bdev, - sector_t block, int size) -{ - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh = head; - int uptodate = PageUptodate(page); - - do { - if (!buffer_mapped(bh)) { - init_buffer(bh, NULL, NULL); - bh->b_bdev = bdev; - bh->b_blocknr = block; - if (uptodate) - set_buffer_uptodate(bh); - set_buffer_mapped(bh); - } - block++; - bh = bh->b_this_page; - } while (bh != head); -} - -/* - * Create the page-cache page that contains the requested block. - * - * This is user purely for blockdev mappings. - */ -static struct page * -grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size) -{ - struct inode *inode = bdev->bd_inode; - struct page *page; - struct buffer_head *bh; - -#ifdef DDE_LINUX - WARN_UNIMPL; - return NULL; -#endif - - page = find_or_create_page(inode->i_mapping, index, - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); - if (!page) - return NULL; - - BUG_ON(!PageLocked(page)); - - if (page_has_buffers(page)) { - bh = page_buffers(page); - if (bh->b_size == size) { - init_page_buffers(page, bdev, block, size); - return page; - } - if (!try_to_free_buffers(page)) - goto failed; - } - - /* - * Allocate some buffers for this page - */ - bh = alloc_page_buffers(page, size, 0); - if (!bh) - goto failed; - - /* - * Link the page to the buffers and initialise them. Take the - * lock to be atomic wrt __find_get_block(), which does not - * run under the page lock. - */ - spin_lock(&inode->i_mapping->private_lock); - link_dev_buffers(page, bh); - init_page_buffers(page, bdev, block, size); - spin_unlock(&inode->i_mapping->private_lock); - return page; - -failed: - BUG(); - unlock_page(page); - page_cache_release(page); - return NULL; -} - -/* - * Create buffers for the specified block device block's page. If - * that page was dirty, the buffers are set dirty also. - */ -static int -grow_buffers(struct block_device *bdev, sector_t block, int size) -{ - struct page *page; - pgoff_t index; - int sizebits; - - sizebits = -1; - do { - sizebits++; - } while ((size << sizebits) < PAGE_SIZE); - - index = block >> sizebits; - - /* - * Check for a block which wants to lie outside our maximum possible - * pagecache index. (this comparison is done using sector_t types). - */ - if (unlikely(index != block >> sizebits)) { - char b[BDEVNAME_SIZE]; - - printk(KERN_ERR "%s: requested out-of-range block %llu for " - "device %s\n", - __func__, (unsigned long long)block, - bdevname(bdev, b)); - return -EIO; - } - block = index << sizebits; - /* Create a page with the proper size buffers.. 
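
grow_buffers() above maps a block number to the page that holds it: sizebits is log2 of blocks-per-page, index is the page index, and shifting back gives the first block on that page (the extra comparison in the original guards against block numbers that overflow pgoff_t). The arithmetic on its own, with 4096 standing in for PAGE_SIZE:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_BYTES	4096ull	/* stand-in for PAGE_SIZE */

int main(void)
{
	unsigned size = 1024;		/* block size in bytes */
	uint64_t block = 10;		/* block number asked for */

	/* sizebits: how many blocks of this size fit in a page, as a
	 * shift. For 1024-byte blocks and 4096-byte pages this is 2. */
	int sizebits = -1;
	do {
		sizebits++;
	} while (((uint64_t)size << sizebits) < PAGE_SIZE_BYTES);

	/* Page index holding this block, and the first block of that page. */
	uint64_t index = block >> sizebits;
	uint64_t first = index << sizebits;

	printf("sizebits=%d page index=%llu first block on page=%llu\n",
	       sizebits, (unsigned long long)index, (unsigned long long)first);
	return 0;
}
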
*/ - page = grow_dev_page(bdev, block, index, size); - if (!page) - return 0; - unlock_page(page); - page_cache_release(page); - return 1; -} - -static struct buffer_head * -__getblk_slow(struct block_device *bdev, sector_t block, int size) -{ - /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_hardsect_size(bdev)-1) || - (size < 512 || size > PAGE_SIZE))) { - printk(KERN_ERR "getblk(): invalid block size %d requested\n", - size); - printk(KERN_ERR "hardsect size: %d\n", - bdev_hardsect_size(bdev)); - - dump_stack(); - return NULL; - } - - for (;;) { - struct buffer_head * bh; - int ret; - - bh = __find_get_block(bdev, block, size); - if (bh) - return bh; - - ret = grow_buffers(bdev, block, size); - if (ret < 0) - return NULL; - if (ret == 0) - free_more_memory(); - } -} - -/* - * The relationship between dirty buffers and dirty pages: - * - * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page is tagged dirty in its radix tree. - * - * At all times, the dirtiness of the buffers represents the dirtiness of - * subsections of the page. If the page has buffers, the page dirty bit is - * merely a hint about the true dirty state. - * - * When a page is set dirty in its entirety, all its buffers are marked dirty - * (if the page has buffers). - * - * When a buffer is marked dirty, its page is dirtied, but the page's other - * buffers are not. - * - * Also. When blockdev buffers are explicitly read with bread(), they - * individually become uptodate. But their backing page remains not - * uptodate - even if all of its buffers are uptodate. A subsequent - * block_read_full_page() against that page will discover all the uptodate - * buffers, will set the page uptodate and will perform no I/O. - */ - -/** - * mark_buffer_dirty - mark a buffer_head as needing writeout - * @bh: the buffer_head to mark dirty - * - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its - * backing page dirty, then tag the page as dirty in its address_space's radix - * tree and then attach the address_space's inode to its superblock's dirty - * inode list. - * - * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and the global inode_lock. - */ -void mark_buffer_dirty(struct buffer_head *bh) -{ -#ifndef DDE_LINUX - WARN_ON_ONCE(!buffer_uptodate(bh)); - - /* - * Very *carefully* optimize the it-is-already-dirty case. - * - * Don't let the final "is it dirty" escape to before we - * perhaps modified the buffer. - */ - if (buffer_dirty(bh)) { - smp_mb(); - if (buffer_dirty(bh)) - return; - } - - if (!test_set_buffer_dirty(bh)) { - struct page *page = bh->b_page; - if (!TestSetPageDirty(page)) - __set_page_dirty(page, page_mapping(page), 0); - } -#else - WARN_UNIMPL; -#endif -} - -/* - * Decrement a buffer_head's reference count. If all buffers against a page - * have zero reference count, are clean and unlocked, and if the page is clean - * and unlocked then try_to_free_buffers() may strip the buffers from the page - * in preparation for freeing it (sometimes, rarely, buffers are removed from - * a page but it ends up not being freed, and buffers may later be reattached). - */ -void __brelse(struct buffer_head * buf) -{ - if (atomic_read(&buf->b_count)) { - put_bh(buf); - return; - } - WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); -} - -/* - * bforget() is like brelse(), except it discards any - * potentially dirty data. 
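
__getblk_slow(), further up in this hunk, rejects block sizes that are not a multiple of the hardware sector size or that fall outside [512, PAGE_SIZE]. The same sanity check as a stand-alone helper (4096 stands in for PAGE_SIZE; the sector size is assumed to be a power of two, as the bitmask trick requires):

#include <stdio.h>

#define PAGE_SIZE_BYTES	4096u	/* stand-in for PAGE_SIZE */

/* Mirrors the check in __getblk_slow(). */
static int blocksize_ok(unsigned size, unsigned hardsect)
{
	if (size & (hardsect - 1))	/* not a multiple of the sector size */
		return 0;
	if (size < 512 || size > PAGE_SIZE_BYTES)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", blocksize_ok(1024, 512));	/* 1: fine */
	printf("%d\n", blocksize_ok(520, 512));		/* 0: not sector aligned */
	printf("%d\n", blocksize_ok(8192, 512));	/* 0: larger than a page */
	return 0;
}
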
- */ -void __bforget(struct buffer_head *bh) -{ - clear_buffer_dirty(bh); - if (bh->b_assoc_map) { - struct address_space *buffer_mapping = bh->b_page->mapping; - - spin_lock(&buffer_mapping->private_lock); - list_del_init(&bh->b_assoc_buffers); - bh->b_assoc_map = NULL; - spin_unlock(&buffer_mapping->private_lock); - } - __brelse(bh); -} - -static struct buffer_head *__bread_slow(struct buffer_head *bh) -{ - lock_buffer(bh); - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - return bh; - } else { - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - } - brelse(bh); - return NULL; -} - -/* - * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). - * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their - * refcount elevated by one when they're in an LRU. A buffer can only appear - * once in a particular CPU's LRU. A single buffer can be present in multiple - * CPU's LRUs at the same time. - * - * This is a transparent caching front-end to sb_bread(), sb_getblk() and - * sb_find_get_block(). - * - * The LRUs themselves only need locking against invalidate_bh_lrus. We use - * a local interrupt disable for that. - */ - -#define BH_LRU_SIZE 8 - -struct bh_lru { - struct buffer_head *bhs[BH_LRU_SIZE]; -}; - -static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; - -#ifdef CONFIG_SMP -#define bh_lru_lock() local_irq_disable() -#define bh_lru_unlock() local_irq_enable() -#else -#define bh_lru_lock() preempt_disable() -#define bh_lru_unlock() preempt_enable() -#endif - -static inline void check_irqs_on(void) -{ -#ifdef irqs_disabled - BUG_ON(irqs_disabled()); -#endif -} - -/* - * The LRU management algorithm is dopey-but-simple. Sorry. - */ -static void bh_lru_install(struct buffer_head *bh) -{ - struct buffer_head *evictee = NULL; - struct bh_lru *lru; - - check_irqs_on(); - bh_lru_lock(); - lru = &__get_cpu_var(bh_lrus); - if (lru->bhs[0] != bh) { - struct buffer_head *bhs[BH_LRU_SIZE]; - int in; - int out = 0; - - get_bh(bh); - bhs[out++] = bh; - for (in = 0; in < BH_LRU_SIZE; in++) { - struct buffer_head *bh2 = lru->bhs[in]; - - if (bh2 == bh) { - __brelse(bh2); - } else { - if (out >= BH_LRU_SIZE) { - BUG_ON(evictee != NULL); - evictee = bh2; - } else { - bhs[out++] = bh2; - } - } - } - while (out < BH_LRU_SIZE) - bhs[out++] = NULL; - memcpy(lru->bhs, bhs, sizeof(bhs)); - } - bh_lru_unlock(); - - if (evictee) - __brelse(evictee); -} - -/* - * Look up the bh in this cpu's LRU. If it's there, move it to the head. - */ -static struct buffer_head * -lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) -{ - struct buffer_head *ret = NULL; - struct bh_lru *lru; - unsigned int i; - - check_irqs_on(); - bh_lru_lock(); - lru = &__get_cpu_var(bh_lrus); - for (i = 0; i < BH_LRU_SIZE; i++) { - struct buffer_head *bh = lru->bhs[i]; - - if (bh && bh->b_bdev == bdev && - bh->b_blocknr == block && bh->b_size == size) { - if (i) { - while (i) { - lru->bhs[i] = lru->bhs[i - 1]; - i--; - } - lru->bhs[0] = bh; - } - get_bh(bh); - ret = bh; - break; - } - } - bh_lru_unlock(); - return ret; -} - -/* - * Perform a pagecache lookup for the matching buffer. If it's there, refresh - * it in the LRU and mark it as accessed. 
If it is not present then return - * NULL - */ -struct buffer_head * -__find_get_block(struct block_device *bdev, sector_t block, unsigned size) -{ - struct buffer_head *bh = lookup_bh_lru(bdev, block, size); - - if (bh == NULL) { - bh = __find_get_block_slow(bdev, block); - if (bh) - bh_lru_install(bh); - } - if (bh) - touch_buffer(bh); - return bh; -} -EXPORT_SYMBOL(__find_get_block); - -/* - * __getblk will locate (and, if necessary, create) the buffer_head - * which corresponds to the passed block_device, block and size. The - * returned buffer has its reference count incremented. - * - * __getblk() cannot fail - it just keeps trying. If you pass it an - * illegal block number, __getblk() will happily return a buffer_head - * which represents the non-existent block. Very weird. - * - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() - * attempt is failing. FIXME, perhaps? - */ -struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, unsigned size) -{ - struct buffer_head *bh = __find_get_block(bdev, block, size); - - might_sleep(); - if (bh == NULL) - bh = __getblk_slow(bdev, block, size); - return bh; -} -EXPORT_SYMBOL(__getblk); - -/* - * Do async read-ahead on a buffer.. - */ -void __breadahead(struct block_device *bdev, sector_t block, unsigned size) -{ - struct buffer_head *bh = __getblk(bdev, block, size); - if (likely(bh)) { - ll_rw_block(READA, 1, &bh); - brelse(bh); - } -} -EXPORT_SYMBOL(__breadahead); - -/** - * __bread() - reads a specified block and returns the bh - * @bdev: the block_device to read from - * @block: number of block - * @size: size (in bytes) to read - * - * Reads a specified block, and returns buffer head that contains it. - * It returns NULL if the block was unreadable. - */ -struct buffer_head * -__bread(struct block_device *bdev, sector_t block, unsigned size) -{ - struct buffer_head *bh = __getblk(bdev, block, size); - - if (likely(bh) && !buffer_uptodate(bh)) - bh = __bread_slow(bh); - return bh; -} -EXPORT_SYMBOL(__bread); - -/* - * invalidate_bh_lrus() is called rarely - but not only at unmount. - * This doesn't race because it runs in each cpu either in irq - * or with preempt disabled. - */ -static void invalidate_bh_lru(void *arg) -{ - struct bh_lru *b = &get_cpu_var(bh_lrus); - int i; - - for (i = 0; i < BH_LRU_SIZE; i++) { - brelse(b->bhs[i]); - b->bhs[i] = NULL; - } - put_cpu_var(bh_lrus); -} - -void invalidate_bh_lrus(void) -{ -#ifndef DDE_LINUX - on_each_cpu(invalidate_bh_lru, NULL, 1); -#endif -} -EXPORT_SYMBOL_GPL(invalidate_bh_lrus); - -void set_bh_page(struct buffer_head *bh, - struct page *page, unsigned long offset) -{ - bh->b_page = page; - BUG_ON(offset >= PAGE_SIZE); - if (PageHighMem(page)) - /* - * This catches illegal uses and preserves the offset: - */ - bh->b_data = (char *)(0 + offset); - else - bh->b_data = page_address(page) + offset; -} -EXPORT_SYMBOL(set_bh_page); - -/* - * Called when truncating a buffer on a page completely. 
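The buffer-head lookup and read helpers above (__find_get_block(), __getblk(), __bread()) are the building blocks filesystems use for metadata I/O. As a hedged illustration only -- not part of the original buffer.c, and bearing in mind that several of these paths are stubbed with WARN_UNIMPL under DDE_LINUX -- a typical read-modify-write of one metadata block could look like the sketch below; the function name, device and block number are hypothetical.

/*
 * Illustrative sketch: read one block through the buffer cache, modify it,
 * and force it out with data-integrity semantics. Assumes the upstream
 * (non-DDE) behaviour of the helpers and <linux/buffer_head.h>.
 */
static int example_update_block(struct block_device *bdev,
                                sector_t blocknr, unsigned blocksize)
{
        struct buffer_head *bh;
        int err;

        bh = __bread(bdev, blocknr, blocksize); /* uptodate bh or NULL */
        if (!bh)
                return -EIO;

        bh->b_data[0] = 0;              /* modify the cached copy */
        mark_buffer_dirty(bh);          /* dirty the bh and its page */

        err = sync_dirty_buffer(bh);    /* write it back and wait */
        brelse(bh);                     /* drop the reference from __bread() */
        return err;
}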
- */ -static void discard_buffer(struct buffer_head * bh) -{ - lock_buffer(bh); - clear_buffer_dirty(bh); - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); - unlock_buffer(bh); -} - -/** - * block_invalidatepage - invalidate part of all of a buffer-backed page - * - * @page: the page which is affected - * @offset: the index of the truncation point - * - * block_invalidatepage() is called when all or part of the page has become - * invalidatedby a truncate operation. - * - * block_invalidatepage() does not have to release all buffers, but it must - * ensure that no dirty buffer is left outside @offset and that no I/O - * is underway against any of the blocks which are outside the truncation - * point. Because the caller is about to free (and possibly reuse) those - * blocks on-disk. - */ -void block_invalidatepage(struct page *page, unsigned long offset) -{ - struct buffer_head *head, *bh, *next; - unsigned int curr_off = 0; - - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - next = bh->b_this_page; - - /* - * is this block fully invalidated? - */ - if (offset <= curr_off) - discard_buffer(bh); - curr_off = next_off; - bh = next; - } while (bh != head); - - /* - * We release buffers only if the entire page is being invalidated. - * The get_block cached value has been unconditionally invalidated, - * so real IO is not possible anymore. - */ - if (offset == 0) - try_to_release_page(page, 0); -out: - return; -} -EXPORT_SYMBOL(block_invalidatepage); - -/* - * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via private_lock. try_to_free_buffers - * is already excluded via the page lock. - */ -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) -{ - struct buffer_head *bh, *head, *tail; - - head = alloc_page_buffers(page, blocksize, 1); - bh = head; - do { - bh->b_state |= b_state; - tail = bh; - bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - - spin_lock(&page->mapping->private_lock); - if (PageUptodate(page) || PageDirty(page)) { - bh = head; - do { - if (PageDirty(page)) - set_buffer_dirty(bh); - if (PageUptodate(page)) - set_buffer_uptodate(bh); - bh = bh->b_this_page; - } while (bh != head); - } - attach_page_buffers(page, head); - spin_unlock(&page->mapping->private_lock); -} -EXPORT_SYMBOL(create_empty_buffers); - -/* - * We are taking a block for data and we don't want any output from any - * buffer-cache aliases starting from return from that function and - * until the moment when something will explicitly mark the buffer - * dirty (hopefully that will not happen until we will free that block ;-) - * We don't even need to mark it not-uptodate - nobody can expect - * anything from a newly allocated buffer anyway. We used to used - * unmap_buffer() for such invalidation, but that was wrong. We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - * - * Also.. Note that bforget() doesn't lock the buffer. So there can - * be writeout I/O going on against recently-freed buffers. We don't - * wait on that I/O in bforget() - it's more efficient to wait on the I/O - * only if we really need to. That happens here. 
- */ -void unmap_underlying_metadata(struct block_device *bdev, sector_t block) -{ - struct buffer_head *old_bh; - - might_sleep(); - - old_bh = __find_get_block_slow(bdev, block); - if (old_bh) { - clear_buffer_dirty(old_bh); - wait_on_buffer(old_bh); - clear_buffer_req(old_bh); - __brelse(old_bh); - } -} -EXPORT_SYMBOL(unmap_underlying_metadata); - -/* - * NOTE! All mapped/uptodate combinations are valid: - * - * Mapped Uptodate Meaning - * - * No No "unknown" - must do get_block() - * No Yes "hole" - zero-filled - * Yes No "allocated" - allocated on disk, not read in - * Yes Yes "valid" - allocated and up-to-date in memory. - * - * "Dirty" is valid only with the last case (mapped+uptodate). - */ - -/* - * While block_write_full_page is writing back the dirty buffers under - * the page lock, whoever dirtied the buffers may decide to clean them - * again at any time. We handle that by only looking at the buffer - * state inside lock_buffer(). - * - * If block_write_full_page() is called for regular writeback - * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a - * locked buffer. This only can happen if someone has written the buffer - * directly, with submit_bh(). At the address_space level PageWriteback - * prevents this contention from occurring. - */ -static int __block_write_full_page(struct inode *inode, struct page *page, - get_block_t *get_block, struct writeback_control *wbc) -{ - int err; - sector_t block; - sector_t last_block; - struct buffer_head *bh, *head; - const unsigned blocksize = 1 << inode->i_blkbits; - int nr_underway = 0; - - BUG_ON(!PageLocked(page)); - - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - - /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers - * here, and the (potentially unmapped) buffers may become dirty at - * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. - * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; - * handle that here by just cleaning them. - */ - - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - head = page_buffers(page); - bh = head; - - /* - * Get all the dirty buffers mapped to disk addresses and - * handle any aliases from the underlying blockdev's mapping. - */ - do { - if (block > last_block) { - /* - * mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a - * truncate in progress. - */ - /* - * The buffer was zeroed by block_write_full_page() - */ - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && - buffer_dirty(bh)) { - WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); - if (err) - goto recover; - clear_buffer_delay(bh); - if (buffer_new(bh)) { - /* blockdev mappings never come here */ - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - } - } - bh = bh->b_this_page; - block++; - } while (bh != head); - - do { - if (!buffer_mapped(bh)) - continue; - /* - * If it's a fully non-blocking write attempt and we cannot - * lock the buffer then redirty the page. Note that this can - * potentially cause a busy-wait loop from pdflush and kswapd - * activity, but those code paths have their own higher-level - * throttling. 
- */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { - lock_buffer(bh); - } else if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); - continue; - } - if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write(bh); - } else { - unlock_buffer(bh); - } - } while ((bh = bh->b_this_page) != head); - - /* - * The page and its buffers are protected by PageWriteback(), so we can - * drop the bh refcounts early. - */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); - nr_underway++; - } - bh = next; - } while (bh != head); - unlock_page(page); - - err = 0; -done: - if (nr_underway == 0) { - /* - * The page was marked dirty, but the buffers were - * clean. Someone wrote them back by hand with - * ll_rw_block/submit_bh. A rare case. - */ - end_page_writeback(page); - - /* - * The page and buffer_heads can be released at any time from - * here on. - */ - } - return err; - -recover: - /* - * ENOSPC, or some other error. We may already have added some - * blocks to the file, so we need to write these out to avoid - * exposing stale data. - * The page is currently locked and not marked for writeback - */ - bh = head; - /* Recovery: lock and submit the mapped buffers */ - do { - if (buffer_mapped(bh) && buffer_dirty(bh) && - !buffer_delay(bh)) { - lock_buffer(bh); - mark_buffer_async_write(bh); - } else { - /* - * The buffer may have been set dirty during - * attachment to a dirty page. - */ - clear_buffer_dirty(bh); - } - } while ((bh = bh->b_this_page) != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - mapping_set_error(page->mapping, err); - set_page_writeback(page); - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - clear_buffer_dirty(bh); - submit_bh(WRITE, bh); - nr_underway++; - } - bh = next; - } while (bh != head); - unlock_page(page); - goto done; -} - -/* - * If a page has any new buffers, zero them out here, and mark them uptodate - * and dirty so they'll be written out (in order to prevent uninitialised - * block data from leaking). And clear the new bit. 
- */ -void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) -{ - unsigned int block_start, block_end; - struct buffer_head *head, *bh; - - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - return; - - bh = head = page_buffers(page); - block_start = 0; - do { - block_end = block_start + bh->b_size; - - if (buffer_new(bh)) { - if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { - unsigned start, size; - - start = max(from, block_start); - size = min(to, block_end) - start; - - zero_user(page, start, size); - set_buffer_uptodate(bh); - } - - clear_buffer_new(bh); - mark_buffer_dirty(bh); - } - } - - block_start = block_end; - bh = bh->b_this_page; - } while (bh != head); -} -EXPORT_SYMBOL(page_zero_new_buffers); - -static int __block_prepare_write(struct inode *inode, struct page *page, - unsigned from, unsigned to, get_block_t *get_block) -{ - unsigned block_start, block_end; - sector_t block; - int err = 0; - unsigned blocksize, bbits; - struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; - - BUG_ON(!PageLocked(page)); - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); - BUG_ON(from > to); - - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); - - bbits = inode->i_blkbits; - block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); - - for(bh = head, block_start = 0; bh != head || !block_start; - block++, block_start=block_end, bh = bh->b_this_page) { - block_end = block_start + blocksize; - if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } - continue; - } - if (buffer_new(bh)) - clear_buffer_new(bh); - if (!buffer_mapped(bh)) { - WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); - if (err) - break; - if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - if (PageUptodate(page)) { - clear_buffer_new(bh); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - continue; - } - if (block_end > to || block_start < from) - zero_user_segments(page, - to, block_end, - block_start, from); - continue; - } - } - if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - continue; - } - if (!buffer_uptodate(bh) && !buffer_delay(bh) && - !buffer_unwritten(bh) && - (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); - *wait_bh++=bh; - } - } - /* - * If we issued read requests - let them complete. - */ - while(wait_bh > wait) { - wait_on_buffer(*--wait_bh); - if (!buffer_uptodate(*wait_bh)) - err = -EIO; - } - if (unlikely(err)) - page_zero_new_buffers(page, from, to); - return err; -} - -static int __block_commit_write(struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - unsigned block_start, block_end; - int partial = 0; - unsigned blocksize; - struct buffer_head *bh, *head; - - blocksize = 1 << inode->i_blkbits; - - for(bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { - block_end = block_start + blocksize; - if (block_end <= from || block_start >= to) { - if (!buffer_uptodate(bh)) - partial = 1; - } else { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } - clear_buffer_new(bh); - } - - /* - * If this is a partial write which happened to make all buffers - * uptodate then we can optimize away a bogus readpage() for - * the next read(). 
Here we 'discover' whether the page went - * uptodate as a result of this (potentially partial) write. - */ - if (!partial) - SetPageUptodate(page); - return 0; -} - -/* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. - */ -int block_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - get_block_t *get_block) -{ -#ifndef DDE_LINUX - struct inode *inode = mapping->host; - int status = 0; - struct page *page; - pgoff_t index; - unsigned start, end; - int ownpage = 0; - - index = pos >> PAGE_CACHE_SHIFT; - start = pos & (PAGE_CACHE_SIZE - 1); - end = start + len; - - page = *pagep; - if (page == NULL) { - ownpage = 1; - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - status = -ENOMEM; - goto out; - } - *pagep = page; - } else - BUG_ON(!PageLocked(page)); - - status = __block_prepare_write(inode, page, start, end, get_block); - if (unlikely(status)) { - ClearPageUptodate(page); - - if (ownpage) { - unlock_page(page); - page_cache_release(page); - *pagep = NULL; - -#ifndef DDE_LINUX - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); -#endif - } - } - -out: - return status; -#else - WARN_UNIMPL; - return -1; -#endif -} -EXPORT_SYMBOL(block_write_begin); - -int block_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - unsigned start; - - start = pos & (PAGE_CACHE_SIZE - 1); - - if (unlikely(copied < len)) { - /* - * The buffers that were written will now be uptodate, so we - * don't have to worry about a readpage reading them and - * overwriting a partial write. However if we have encountered - * a short write and only partially written into a buffer, it - * will not be marked uptodate, so a readpage might come in and - * destroy our partial write. - * - * Do the simplest thing, and just treat any short write to a - * non uptodate page as a zero-length write, and force the - * caller to redo the whole thing. - */ - if (!PageUptodate(page)) - copied = 0; - - page_zero_new_buffers(page, start+copied, start+len); - } - flush_dcache_page(page); - - /* This could be a short (even 0-length) commit */ - __block_commit_write(inode, page, start, start+copied); - - return copied; -} -EXPORT_SYMBOL(block_write_end); - -int generic_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - int i_size_changed = 0; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. 
- */ - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - i_size_changed = 1; - } - - unlock_page(page); - page_cache_release(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - if (i_size_changed) - mark_inode_dirty(inode); - - return copied; -} -EXPORT_SYMBOL(generic_write_end); - -/* - * block_is_partially_uptodate checks whether buffers within a page are - * uptodate or not. - * - * Returns true if all buffers which correspond to a file portion - * we want to read are uptodate. - */ -int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, - unsigned long from) -{ - struct inode *inode = page->mapping->host; - unsigned block_start, block_end, blocksize; - unsigned to; - struct buffer_head *bh, *head; - int ret = 1; - - if (!page_has_buffers(page)) - return 0; - - blocksize = 1 << inode->i_blkbits; - to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); - to = from + to; - if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) - return 0; - - head = page_buffers(page); - bh = head; - block_start = 0; - do { - block_end = block_start + blocksize; - if (block_end > from && block_start < to) { - if (!buffer_uptodate(bh)) { - ret = 0; - break; - } - if (block_end >= to) - break; - } - block_start = block_end; - bh = bh->b_this_page; - } while (bh != head); - - return ret; -} -EXPORT_SYMBOL(block_is_partially_uptodate); - -/* - * Generic "read page" function for block devices that have the normal - * get_block functionality. This is most of the block device filesystems. - * Reads the page asynchronously --- the unlock_buffer() and - * set/clear_buffer_uptodate() functions propagate buffer state into the - * page struct once IO has completed. - */ -int block_read_full_page(struct page *page, get_block_t *get_block) -{ - struct inode *inode = page->mapping->host; - sector_t iblock, lblock; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize; - int nr, i; - int fully_mapped = 1; - - BUG_ON(!PageLocked(page)); - blocksize = 1 << inode->i_blkbits; - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); - - iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; - bh = head; - nr = 0; - i = 0; - - do { - if (buffer_uptodate(bh)) - continue; - - if (!buffer_mapped(bh)) { - int err = 0; - - fully_mapped = 0; - if (iblock < lblock) { - WARN_ON(bh->b_size != blocksize); - err = get_block(inode, iblock, bh, 0); - if (err) - SetPageError(page); - } - if (!buffer_mapped(bh)) { - zero_user(page, i * blocksize, blocksize); - if (!err) - set_buffer_uptodate(bh); - continue; - } - /* - * get_block() might have updated the buffer - * synchronously - */ - if (buffer_uptodate(bh)) - continue; - } - arr[nr++] = bh; - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - if (fully_mapped) - SetPageMappedToDisk(page); - - if (!nr) { - /* - * All buffers are uptodate - we can set the page uptodate - * as well. But not if get_block() returned an error. - */ - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - return 0; - } - - /* Stage two: lock the buffers */ - for (i = 0; i < nr; i++) { - bh = arr[i]; - lock_buffer(bh); - mark_buffer_async_read(bh); - } - - /* - * Stage 3: start the IO. 
Check for uptodateness - * inside the buffer lock in case another process reading - * the underlying blockdev brought it uptodate (the sct fix). - */ - for (i = 0; i < nr; i++) { - bh = arr[i]; - if (buffer_uptodate(bh)) - end_buffer_async_read(bh, 1); - else - submit_bh(READ, bh); - } - return 0; -} - -/* utility function for filesystems that need to do work on expanding - * truncates. Uses filesystem pagecache writes to allow the filesystem to - * deal with the hole. - */ -int generic_cont_expand_simple(struct inode *inode, loff_t size) -{ - struct address_space *mapping = inode->i_mapping; - struct page *page; - void *fsdata; - unsigned long limit; - int err; - - err = -EFBIG; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && size > (loff_t)limit) { - send_sig(SIGXFSZ, current, 0); - goto out; - } - if (size > inode->i_sb->s_maxbytes) - goto out; - - err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, - &page, &fsdata); - if (err) - goto out; - - err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); - BUG_ON(err > 0); - -out: - return err; -} - -static int cont_expand_zero(struct file *file, struct address_space *mapping, - loff_t pos, loff_t *bytes) -{ - struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; - struct page *page; - void *fsdata; - pgoff_t index, curidx; - loff_t curpos; - unsigned zerofrom, offset, len; - int err = 0; - - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ~PAGE_CACHE_MASK; - - while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { - zerofrom = curpos & ~PAGE_CACHE_MASK; - if (zerofrom & (blocksize-1)) { - *bytes |= (blocksize-1); - (*bytes)++; - } - len = PAGE_CACHE_SIZE - zerofrom; - - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (err) - goto out; - zero_user(page, zerofrom, len); - err = pagecache_write_end(file, mapping, curpos, len, len, - page, fsdata); - if (err < 0) - goto out; - BUG_ON(err != len); - err = 0; - - balance_dirty_pages_ratelimited(mapping); - } - - /* page covers the boundary, find the boundary offset */ - if (index == curidx) { - zerofrom = curpos & ~PAGE_CACHE_MASK; - /* if we will expand the thing last block will be filled */ - if (offset <= zerofrom) { - goto out; - } - if (zerofrom & (blocksize-1)) { - *bytes |= (blocksize-1); - (*bytes)++; - } - len = offset - zerofrom; - - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (err) - goto out; - zero_user(page, zerofrom, len); - err = pagecache_write_end(file, mapping, curpos, len, len, - page, fsdata); - if (err < 0) - goto out; - BUG_ON(err != len); - err = 0; - } -out: - return err; -} - -/* - * For moronic filesystems that do not allow holes in file. - * We may have to extend the file. 
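Taken together, block_read_full_page(), block_write_begin()/generic_write_end() and block_write_full_page() (defined further down in this file) form the generic buffer-backed address_space implementation. A hedged sketch of how a simple filesystem whose file blocks map 1:1 onto device blocks might wire them up follows; all example_* names are hypothetical and the get_block callback is deliberately trivial.

/* Illustrative sketch only; assumes a trivial 1:1 block mapping. */
static int example_get_block(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create)
{
        /* file block N lives in device block N */
        map_bh(bh_result, inode->i_sb, iblock);
        return 0;
}

static int example_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, example_get_block);
}

static int example_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, example_get_block, wbc);
}

static int example_write_begin(struct file *file,
                               struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
{
        *pagep = NULL;  /* let block_write_begin allocate the page */
        return block_write_begin(file, mapping, pos, len, flags,
                                 pagep, fsdata, example_get_block);
}

static sector_t example_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, example_get_block);
}

static const struct address_space_operations example_aops = {
        .readpage       = example_readpage,
        .writepage      = example_writepage,
        .sync_page      = block_sync_page,
        .write_begin    = example_write_begin,
        .write_end      = generic_write_end,
        .bmap           = example_bmap,
};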
- */ -int cont_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - get_block_t *get_block, loff_t *bytes) -{ - struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; - unsigned zerofrom; - int err; - - err = cont_expand_zero(file, mapping, pos, bytes); - if (err) - goto out; - - zerofrom = *bytes & ~PAGE_CACHE_MASK; - if (pos+len > *bytes && zerofrom & (blocksize-1)) { - *bytes |= (blocksize-1); - (*bytes)++; - } - - *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, - flags, pagep, fsdata, get_block); -out: - return err; -} - -int block_prepare_write(struct page *page, unsigned from, unsigned to, - get_block_t *get_block) -{ - struct inode *inode = page->mapping->host; - int err = __block_prepare_write(inode, page, from, to, get_block); - if (err) - ClearPageUptodate(page); - return err; -} - -int block_commit_write(struct page *page, unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - __block_commit_write(inode,page,from,to); - return 0; -} - -/* - * block_page_mkwrite() is not allowed to change the file size as it gets - * called from a page fault handler when a page is first dirtied. Hence we must - * be careful to check for EOF conditions here. We set the page up correctly - * for a written page which means we get ENOSPC checking when writing into - * holes and correct delalloc and unwritten extent mapping on filesystems that - * support these features. - * - * We are not allowed to take the i_mutex here so we have to play games to - * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the - * page lock we can determine safely if the page is beyond EOF. If it is not - * beyond EOF, then the page is guaranteed safe against truncation until we - * unlock the page. - */ -int -block_page_mkwrite(struct vm_area_struct *vma, struct page *page, - get_block_t get_block) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - unsigned long end; - loff_t size; - int ret = -EINVAL; - - lock_page(page); - size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || - (page_offset(page) > size)) { - /* page got truncated out from underneath us */ - goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) - end = size & ~PAGE_CACHE_MASK; - else - end = PAGE_CACHE_SIZE; - - ret = block_prepare_write(page, 0, end, get_block); - if (!ret) - ret = block_commit_write(page, 0, end); - -out_unlock: - unlock_page(page); - return ret; -} - -/* - * nobh_write_begin()'s prereads are special: the buffer_heads are freed - * immediately, while under the page lock. So it needs a special end_io - * handler which does not touch the bh after unlocking it. - */ -static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) -{ - __end_buffer_read_notouch(bh, uptodate); -} - -/* - * Attach the singly-linked list of buffers created by nobh_write_begin, to - * the page (converting it to circular linked list and taking care of page - * dirty races). 
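block_page_mkwrite() above is meant to be called from a filesystem's page_mkwrite handler. A minimal sketch, assuming the 2.6.29-era vm_operations_struct signature (page_mkwrite still receives a struct page here) and reusing the hypothetical example_get_block callback from the previous sketch:

/* Illustrative sketch; example_get_block is the hypothetical callback above. */
static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
        return block_page_mkwrite(vma, page, example_get_block);
}

static struct vm_operations_struct example_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = example_page_mkwrite,
};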
- */ -static void attach_nobh_buffers(struct page *page, struct buffer_head *head) -{ - struct buffer_head *bh; - - BUG_ON(!PageLocked(page)); - - spin_lock(&page->mapping->private_lock); - bh = head; - do { - if (PageDirty(page)) - set_buffer_dirty(bh); - if (!bh->b_this_page) - bh->b_this_page = head; - bh = bh->b_this_page; - } while (bh != head); - attach_page_buffers(page, head); - spin_unlock(&page->mapping->private_lock); -} - -/* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) - */ -int nobh_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata, - get_block_t *get_block) -{ - struct inode *inode = mapping->host; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; - struct buffer_head *head, *bh; - struct page *page; - pgoff_t index; - unsigned from, to; - unsigned block_in_page; - unsigned block_start, block_end; - sector_t block_in_file; - int nr_reads = 0; - int ret = 0; - int is_mapped_to_disk = 1; - - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; - *fsdata = NULL; - - if (page_has_buffers(page)) { - unlock_page(page); - page_cache_release(page); - *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); - } - - if (PageMappedToDisk(page)) - return 0; - - /* - * Allocate buffers so that we can keep track of state, and potentially - * attach them to the page if an error occurs. In the common case of - * no error, they will just be freed again without ever being attached - * to the page (which is all OK, because we're under the page lock). - * - * Be careful: the buffer linked list is a NULL terminated one, rather - * than the circular one we're used to. - */ - head = alloc_page_buffers(page, blocksize, 0); - if (!head) { - ret = -ENOMEM; - goto out_release; - } - - block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); - - /* - * We loop across all blocks in the page, whether or not they are - * part of the affected region. This is so we can discover if the - * page is fully mapped-to-disk. - */ - for (block_start = 0, block_in_page = 0, bh = head; - block_start < PAGE_CACHE_SIZE; - block_in_page++, block_start += blocksize, bh = bh->b_this_page) { - int create; - - block_end = block_start + blocksize; - bh->b_state = 0; - create = 1; - if (block_start >= to) - create = 0; - ret = get_block(inode, block_in_file + block_in_page, - bh, create); - if (ret) - goto failed; - if (!buffer_mapped(bh)) - is_mapped_to_disk = 0; - if (buffer_new(bh)) - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - if (PageUptodate(page)) { - set_buffer_uptodate(bh); - continue; - } - if (buffer_new(bh) || !buffer_mapped(bh)) { - zero_user_segments(page, block_start, from, - to, block_end); - continue; - } - if (buffer_uptodate(bh)) - continue; /* reiserfs does this */ - if (block_start < from || block_end > to) { - lock_buffer(bh); - bh->b_end_io = end_buffer_read_nobh; - submit_bh(READ, bh); - nr_reads++; - } - } - - if (nr_reads) { - /* - * The page is locked, so these buffers are protected from - * any VM or truncate activity. Hence we don't need to care - * for the buffer_head refcounts. 
- */ - for (bh = head; bh; bh = bh->b_this_page) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - ret = -EIO; - } - if (ret) - goto failed; - } - - if (is_mapped_to_disk) - SetPageMappedToDisk(page); - - *fsdata = head; /* to be released by nobh_write_end */ - - return 0; - -failed: - BUG_ON(!ret); - /* - * Error recovery is a bit difficult. We need to zero out blocks that - * were newly allocated, and dirty them to ensure they get written out. - * Buffers need to be attached to the page at this point, otherwise - * the handling of potential IO errors during writeout would be hard - * (could try doing synchronous writeout, but what if that fails too?) - */ - attach_nobh_buffers(page, head); - page_zero_new_buffers(page, from, to); - -out_release: - unlock_page(page); - page_cache_release(page); - *pagep = NULL; - - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); - - return ret; -} -EXPORT_SYMBOL(nobh_write_begin); - -int nobh_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *head = fsdata; - struct buffer_head *bh; - BUG_ON(fsdata != NULL && page_has_buffers(page)); - - if (unlikely(copied < len) && head) - attach_nobh_buffers(page, head); - if (page_has_buffers(page)) - return generic_write_end(file, mapping, pos, len, - copied, page, fsdata); - - SetPageUptodate(page); - set_page_dirty(page); - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - unlock_page(page); - page_cache_release(page); - - while (head) { - bh = head; - head = head->b_this_page; - free_buffer_head(bh); - } - - return copied; -} -EXPORT_SYMBOL(nobh_write_end); - -/* - * nobh_writepage() - based on block_full_write_page() except - * that it tries to operate without attaching bufferheads to - * the page. - */ -int nobh_writepage(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; - int ret; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - goto out; - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ -#if 0 - /* Not really sure about this - do we need this ? */ - if (page->mapping->a_ops->invalidatepage) - page->mapping->a_ops->invalidatepage(page, offset); -#endif - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." 
- */ - zero_user_segment(page, offset, PAGE_CACHE_SIZE); -out: - ret = mpage_writepage(page, get_block, wbc); - if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc); - return ret; -} -EXPORT_SYMBOL(nobh_writepage); - -int nobh_truncate_page(struct address_space *mapping, - loff_t from, get_block_t *get_block) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize; - sector_t iblock; - unsigned length, pos; - struct inode *inode = mapping->host; - struct page *page; - struct buffer_head map_bh; - int err; - - blocksize = 1 << inode->i_blkbits; - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - page = grab_cache_page(mapping, index); - err = -ENOMEM; - if (!page) - goto out; - - if (page_has_buffers(page)) { -has_buffers: - unlock_page(page); - page_cache_release(page); - return block_truncate_page(mapping, from, get_block); - } - - /* Find the buffer that contains "offset" */ - pos = blocksize; - while (offset >= pos) { - iblock++; - pos += blocksize; - } - - err = get_block(inode, iblock, &map_bh, 0); - if (err) - goto unlock; - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(&map_bh)) - goto unlock; - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (!PageUptodate(page)) { - err = mapping->a_ops->readpage(NULL, page); - if (err) { - page_cache_release(page); - goto out; - } - lock_page(page); - if (!PageUptodate(page)) { - err = -EIO; - goto unlock; - } - if (page_has_buffers(page)) - goto has_buffers; - } - zero_user(page, offset, length); - set_page_dirty(page); - err = 0; - -unlock: - unlock_page(page); - page_cache_release(page); -out: - return err; -} -EXPORT_SYMBOL(nobh_truncate_page); - -int block_truncate_page(struct address_space *mapping, - loff_t from, get_block_t *get_block) -{ - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize; - sector_t iblock; - unsigned length, pos; - struct inode *inode = mapping->host; - struct page *page; - struct buffer_head *bh; - int err; - - blocksize = 1 << inode->i_blkbits; - length = offset & (blocksize - 1); - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - - length = blocksize - length; - iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - page = grab_cache_page(mapping, index); - err = -ENOMEM; - if (!page) - goto out; - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (!buffer_mapped(bh)) { - WARN_ON(bh->b_size != blocksize); - err = get_block(inode, iblock, bh, 0); - if (err) - goto unlock; - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) - goto unlock; - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. 
*/ - if (!buffer_uptodate(bh)) - goto unlock; - } - - zero_user(page, offset, length); - mark_buffer_dirty(bh); - err = 0; - -unlock: - unlock_page(page); - page_cache_release(page); -out: - return err; -} - -/* - * The generic ->writepage function for buffer-backed address_spaces - */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc); - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - /* - * The page may have dirty, unmapped buffers. For example, - * they may have been added in ext3_writepage(). Make them - * freeable here, so the page does not leak. - */ - do_invalidatepage(page, 0); - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invokation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc); -} - -sector_t generic_block_bmap(struct address_space *mapping, sector_t block, - get_block_t *get_block) -{ - struct buffer_head tmp; - struct inode *inode = mapping->host; - tmp.b_state = 0; - tmp.b_blocknr = 0; - tmp.b_size = 1 << inode->i_blkbits; - get_block(inode, block, &tmp, 0); - return tmp.b_blocknr; -} - -static void end_bio_bh_io_sync(struct bio *bio, int err) -{ - struct buffer_head *bh = bio->bi_private; - - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - set_bit(BH_Eopnotsupp, &bh->b_state); - } - - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) - set_bit(BH_Quiet, &bh->b_state); - - bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); - bio_put(bio); -} - -int submit_bh(int rw, struct buffer_head * bh) -{ - struct bio *bio; - int ret = 0; - - BUG_ON(!buffer_locked(bh)); - BUG_ON(!buffer_mapped(bh)); - BUG_ON(!bh->b_end_io); - - /* - * Mask in barrier bit for a write (could be either a WRITE or a - * WRITE_SYNC - */ - if (buffer_ordered(bh) && (rw & WRITE)) - rw |= WRITE_BARRIER; - - /* - * Only clear out a write error when rewriting - */ - if (test_set_buffer_req(bh) && (rw & WRITE)) - clear_buffer_write_io_error(bh); - - /* - * from here on down, it's all bio -- do the initial mapping, - * submit_bio -> generic_make_request may further map this bio around - */ - bio = bio_alloc(GFP_NOIO, 1); - - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - bio->bi_io_vec[0].bv_page = bh->b_page; - bio->bi_io_vec[0].bv_len = bh->b_size; - bio->bi_io_vec[0].bv_offset = bh_offset(bh); - - bio->bi_vcnt = 1; - bio->bi_idx = 0; - bio->bi_size = bh->b_size; - - bio->bi_end_io = end_bio_bh_io_sync; - bio->bi_private = bh; - - bio_get(bio); - submit_bio(rw, bio); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - - bio_put(bio); - return ret; -} - -/** - * ll_rw_block: low-level access to block devices (DEPRECATED) - * @rw: whether to %READ or %WRITE or %SWRITE or 
maybe %READA (readahead) - * @nr: number of &struct buffer_heads in the array - * @bhs: array of pointers to &struct buffer_head - * - * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %READ or a %WRITE. The third - * %SWRITE is like %WRITE only we make sure that the *current* data in buffers - * are sent to disk. The fourth %READA option is described in the documentation - * for generic_make_request() which ll_rw_block() calls. - * - * This function drops any buffer that it cannot get a lock on (with the - * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be - * clean when doing a write request, and any buffer that appears to be - * up-to-date when doing read request. Further it marks as clean buffers that - * are processed for writing (the buffer cache won't assume that they are - * actually clean until the buffer gets unlocked). - * - * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if approriate), unlocks the buffer and wakes - * any waiters. - * - * All of the buffers must be for the same device, and must also be a - * multiple of the current approved size for the device. - */ -void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) -{ - int i; - - for (i = 0; i < nr; i++) { - struct buffer_head *bh = bhs[i]; - - if (rw == SWRITE || rw == SWRITE_SYNC) - lock_buffer(bh); - else if (!trylock_buffer(bh)) - continue; - - if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - if (rw == SWRITE_SYNC) - submit_bh(WRITE_SYNC, bh); - else - submit_bh(WRITE, bh); - continue; - } - } else { - if (!buffer_uptodate(bh)) { - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(rw, bh); - continue; - } - } - unlock_buffer(bh); - } -} - -/* - * For a data-integrity writeout, we need to wait upon any in-progress I/O - * and then start new I/O and then wait upon it. The caller must have a ref on - * the buffer_head. - */ -int sync_dirty_buffer(struct buffer_head *bh) -{ - int ret = 0; - - WARN_ON(atomic_read(&bh->b_count) < 1); - lock_buffer(bh); - if (test_clear_buffer_dirty(bh)) { - get_bh(bh); - bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); - wait_on_buffer(bh); - if (buffer_eopnotsupp(bh)) { - clear_buffer_eopnotsupp(bh); - ret = -EOPNOTSUPP; - } - if (!ret && !buffer_uptodate(bh)) - ret = -EIO; - } else { - unlock_buffer(bh); - } - return ret; -} - -/* - * try_to_free_buffers() checks if all the buffers on this particular page - * are unused, and releases them if so. - * - * Exclusion against try_to_free_buffers may be obtained by either - * locking the page or by holding its mapping's private_lock. - * - * If the page is dirty but all the buffers are clean then we need to - * be sure to mark the page clean as well. This is because the page - * may be against a block device, and a later reattachment of buffers - * to a dirty page will set *all* buffers dirty. Which would corrupt - * filesystem data on the same device. - * - * The same applies to regular filesystem pages: if all the buffers are - * clean then we set the page clean and proceed. To do that, we require - * total exclusion from __set_page_dirty_buffers(). That is obtained with - * private_lock. - * - * try_to_free_buffers() is non-blocking. 
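ll_rw_block() above is typically used to batch reads of several buffers obtained with __getblk() that may not yet be uptodate. A hedged sketch of that pattern (hypothetical helper, fixed-size batch) is shown below.

/* Illustrative sketch: read nr consecutive blocks and wait for them. */
static int example_read_blocks(struct block_device *bdev, sector_t first,
                               int nr, unsigned blocksize)
{
        struct buffer_head *bhs[16];
        int i, err = 0;

        if (nr < 1 || nr > 16)
                return -EINVAL;

        for (i = 0; i < nr; i++) {
                bhs[i] = __getblk(bdev, first + i, blocksize);
                if (!bhs[i]) {          /* only happens for a bad size */
                        while (i--)
                                brelse(bhs[i]);
                        return -EIO;
                }
        }

        /* submits READs only for buffers that are not already uptodate */
        ll_rw_block(READ, nr, bhs);

        for (i = 0; i < nr; i++) {
                wait_on_buffer(bhs[i]);
                if (!buffer_uptodate(bhs[i]))
                        err = -EIO;
                brelse(bhs[i]);
        }
        return err;
}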
- */ -static inline int buffer_busy(struct buffer_head *bh) -{ - return atomic_read(&bh->b_count) | - (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); -} - -static int -drop_buffers(struct page *page, struct buffer_head **buffers_to_free) -{ - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh; - - bh = head; - do { - if (buffer_write_io_error(bh) && page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - if (buffer_busy(bh)) - goto failed; - bh = bh->b_this_page; - } while (bh != head); - - do { - struct buffer_head *next = bh->b_this_page; - - if (bh->b_assoc_map) - __remove_assoc_queue(bh); - bh = next; - } while (bh != head); - *buffers_to_free = head; - __clear_page_buffers(page); - return 1; -failed: - return 0; -} - -int try_to_free_buffers(struct page *page) -{ - struct address_space * const mapping = page->mapping; - struct buffer_head *buffers_to_free = NULL; - int ret = 0; - - BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) - return 0; - - if (mapping == NULL) { /* can this still happen? */ - ret = drop_buffers(page, &buffers_to_free); - goto out; - } - - spin_lock(&mapping->private_lock); - ret = drop_buffers(page, &buffers_to_free); - - /* - * If the filesystem writes its buffers by hand (eg ext3) - * then we can have clean buffers against a dirty page. We - * clean the page here; otherwise the VM will never notice - * that the filesystem did any IO at all. - * - * Also, during truncate, discard_buffer will have marked all - * the page's buffers clean. We discover that here and clean - * the page also. - * - * private_lock must be held over this entire operation in order - * to synchronise against __set_page_dirty_buffers and prevent the - * dirty bit from being lost. - */ -#ifndef DDE_LINUX - if (ret) - cancel_dirty_page(page, PAGE_CACHE_SIZE); -#endif - spin_unlock(&mapping->private_lock); -out: - if (buffers_to_free) { - struct buffer_head *bh = buffers_to_free; - - do { - struct buffer_head *next = bh->b_this_page; - free_buffer_head(bh); - bh = next; - } while (bh != buffers_to_free); - } - return ret; -} -EXPORT_SYMBOL(try_to_free_buffers); - -void block_sync_page(struct page *page) -{ - struct address_space *mapping; - - smp_mb(); - mapping = page_mapping(page); - if (mapping) - blk_run_backing_dev(mapping->backing_dev_info, page); -} - -/* - * There are no bdflush tunables left. But distributions are - * still running obsolete flush daemons, so we terminate them here. - * - * Use of bdflush() is deprecated and will be removed in a future kernel. - * The `pdflush' kernel threads fully replace bdflush daemons and this call. - */ -SYSCALL_DEFINE2(bdflush, int, func, long, data) -{ - static int msg_count; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the obsolete bdflush" - " system call\n", current->comm); - printk(KERN_INFO "Fix your initscripts?\n"); - } - - if (func == 1) - do_exit(0); - return 0; -} - -/* - * Buffer-head allocation - */ -static struct kmem_cache *bh_cachep; - -/* - * Once the number of bh's in the machine exceeds this level, we start - * stripping them in writeback. 
- */ -static int max_buffer_heads; - -int buffer_heads_over_limit; - -struct bh_accounting { - int nr; /* Number of live bh's */ - int ratelimit; /* Limit cacheline bouncing */ -}; - -static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; - -static void recalc_bh_state(void) -{ - int i; - int tot = 0; - - if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) - return; - __get_cpu_var(bh_accounting).ratelimit = 0; - for_each_online_cpu(i) - tot += per_cpu(bh_accounting, i).nr; - buffer_heads_over_limit = (tot > max_buffer_heads); -} - -struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) -{ - struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); - if (ret) { - INIT_LIST_HEAD(&ret->b_assoc_buffers); - get_cpu_var(bh_accounting).nr++; - recalc_bh_state(); - put_cpu_var(bh_accounting); - } - return ret; -} -EXPORT_SYMBOL(alloc_buffer_head); - -void free_buffer_head(struct buffer_head *bh) -{ - BUG_ON(!list_empty(&bh->b_assoc_buffers)); - kmem_cache_free(bh_cachep, bh); - get_cpu_var(bh_accounting).nr--; - recalc_bh_state(); - put_cpu_var(bh_accounting); -} -EXPORT_SYMBOL(free_buffer_head); - -static void buffer_exit_cpu(int cpu) -{ - int i; - struct bh_lru *b = &per_cpu(bh_lrus, cpu); - - for (i = 0; i < BH_LRU_SIZE; i++) { - brelse(b->bhs[i]); - b->bhs[i] = NULL; - } - get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; - per_cpu(bh_accounting, cpu).nr = 0; - put_cpu_var(bh_accounting); -} - -static int buffer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - buffer_exit_cpu((unsigned long)hcpu); - return NOTIFY_OK; -} - -/** - * bh_uptodate_or_lock - Test whether the buffer is uptodate - * @bh: struct buffer_head - * - * Return true if the buffer is up-to-date and false, - * with the buffer locked, if not. - */ -int bh_uptodate_or_lock(struct buffer_head *bh) -{ - if (!buffer_uptodate(bh)) { - lock_buffer(bh); - if (!buffer_uptodate(bh)) - return 0; - unlock_buffer(bh); - } - return 1; -} -EXPORT_SYMBOL(bh_uptodate_or_lock); - -/** - * bh_submit_read - Submit a locked buffer for reading - * @bh: struct buffer_head - * - * Returns zero on success and -EIO on error. 
- */ -int bh_submit_read(struct buffer_head *bh) -{ - BUG_ON(!buffer_locked(bh)); - - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - return 0; - } - - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return 0; - return -EIO; -} -EXPORT_SYMBOL(bh_submit_read); - -static void -init_buffer_head(void *data) -{ - struct buffer_head *bh = data; - - memset(bh, 0, sizeof(*bh)); - INIT_LIST_HEAD(&bh->b_assoc_buffers); -} - -void __init buffer_init(void) -{ - int nrpages; - - bh_cachep = kmem_cache_create("buffer_head", - sizeof(struct buffer_head), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), - init_buffer_head); - - /* - * Limit the bh occupancy to 10% of ZONE_NORMAL - */ - nrpages = (nr_free_buffer_pages() * 10) / 100; - max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); - hotcpu_notifier(buffer_cpu_notify, 0); -} - -EXPORT_SYMBOL(__bforget); -EXPORT_SYMBOL(__brelse); -EXPORT_SYMBOL(__wait_on_buffer); -EXPORT_SYMBOL(block_commit_write); -EXPORT_SYMBOL(block_prepare_write); -EXPORT_SYMBOL(block_page_mkwrite); -EXPORT_SYMBOL(block_read_full_page); -EXPORT_SYMBOL(block_sync_page); -EXPORT_SYMBOL(block_truncate_page); -EXPORT_SYMBOL(block_write_full_page); -EXPORT_SYMBOL(cont_write_begin); -EXPORT_SYMBOL(end_buffer_read_sync); -EXPORT_SYMBOL(end_buffer_write_sync); -EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(fsync_bdev); -EXPORT_SYMBOL(generic_block_bmap); -EXPORT_SYMBOL(generic_cont_expand_simple); -EXPORT_SYMBOL(init_buffer); -EXPORT_SYMBOL(invalidate_bdev); -EXPORT_SYMBOL(ll_rw_block); -EXPORT_SYMBOL(mark_buffer_dirty); -EXPORT_SYMBOL(submit_bh); -EXPORT_SYMBOL(sync_dirty_buffer); -EXPORT_SYMBOL(unlock_buffer); diff --git a/libdde_linux26/lib/src/fs/.svn/text-base/char_dev.c.svn-base b/libdde_linux26/lib/src/fs/.svn/text-base/char_dev.c.svn-base deleted file mode 100644 index 3b8e8b3d..00000000 --- a/libdde_linux26/lib/src/fs/.svn/text-base/char_dev.c.svn-base +++ /dev/null @@ -1,572 +0,0 @@ -/* - * linux/fs/char_dev.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/kdev_t.h> -#include <linux/slab.h> -#include <linux/string.h> - -#include <linux/major.h> -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/smp_lock.h> -#include <linux/seq_file.h> - -#include <linux/kobject.h> -#include <linux/kobj_map.h> -#include <linux/cdev.h> -#include <linux/mutex.h> -#include <linux/backing-dev.h> - -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif -#include "internal.h" - -#ifdef DDE_LINUX -#include "local.h" -#endif - -/* - * capabilities for /dev/mem, /dev/kmem and similar directly mappable character - * devices - * - permits shared-mmap for read, write and/or exec - * - does not permit private mmap in NOMMU mode (can't do COW) - * - no readahead or I/O queue unplugging required - */ -struct backing_dev_info directly_mappable_cdev_bdi = { - .capabilities = ( -#ifdef CONFIG_MMU - /* permit private copies of the data to be taken */ - BDI_CAP_MAP_COPY | -#endif - /* permit direct mmap, for read, write or exec */ - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), -}; - -static struct kobj_map *cdev_map; - -static DEFINE_MUTEX(chrdevs_lock); - -static struct char_device_struct { - struct char_device_struct *next; - unsigned int major; - unsigned int baseminor; - int minorct; - char name[64]; - struct cdev *cdev; /* will die */ -} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; - -/* index in 
the above */ -static inline int major_to_index(int major) -{ - return major % CHRDEV_MAJOR_HASH_SIZE; -} - -#ifdef CONFIG_PROC_FS - -void chrdev_show(struct seq_file *f, off_t offset) -{ - struct char_device_struct *cd; - - if (offset < CHRDEV_MAJOR_HASH_SIZE) { - mutex_lock(&chrdevs_lock); - for (cd = chrdevs[offset]; cd; cd = cd->next) - seq_printf(f, "%3d %s\n", cd->major, cd->name); - mutex_unlock(&chrdevs_lock); - } -} - -#endif /* CONFIG_PROC_FS */ - -/* - * Register a single major with a specified minor range. - * - * If major == 0 this functions will dynamically allocate a major and return - * its number. - * - * If major > 0 this function will attempt to reserve the passed range of - * minors and will return zero on success. - * - * Returns a -ve errno on failure. - */ -static struct char_device_struct * -__register_chrdev_region(unsigned int major, unsigned int baseminor, - int minorct, const char *name) -{ - struct char_device_struct *cd, **cp; - int ret = 0; - int i; - - cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); - if (cd == NULL) - return ERR_PTR(-ENOMEM); - - mutex_lock(&chrdevs_lock); - - /* temporary */ - if (major == 0) { - for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { - if (chrdevs[i] == NULL) - break; - } - - if (i == 0) { - ret = -EBUSY; - goto out; - } - major = i; - ret = major; - } - - cd->major = major; - cd->baseminor = baseminor; - cd->minorct = minorct; - strlcpy(cd->name, name, sizeof(cd->name)); - - i = major_to_index(major); - - for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) - if ((*cp)->major > major || - ((*cp)->major == major && - (((*cp)->baseminor >= baseminor) || - ((*cp)->baseminor + (*cp)->minorct > baseminor)))) - break; - - /* Check for overlapping minor ranges. */ - if (*cp && (*cp)->major == major) { - int old_min = (*cp)->baseminor; - int old_max = (*cp)->baseminor + (*cp)->minorct - 1; - int new_min = baseminor; - int new_max = baseminor + minorct - 1; - - /* New driver overlaps from the left. */ - if (new_max >= old_min && new_max <= old_max) { - ret = -EBUSY; - goto out; - } - - /* New driver overlaps from the right. */ - if (new_min <= old_max && new_min >= old_min) { - ret = -EBUSY; - goto out; - } - } - - cd->next = *cp; - *cp = cd; - mutex_unlock(&chrdevs_lock); - return cd; -out: - mutex_unlock(&chrdevs_lock); - kfree(cd); - return ERR_PTR(ret); -} - -static struct char_device_struct * -__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) -{ - struct char_device_struct *cd = NULL, **cp; - int i = major_to_index(major); - - mutex_lock(&chrdevs_lock); - for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) - if ((*cp)->major == major && - (*cp)->baseminor == baseminor && - (*cp)->minorct == minorct) - break; - if (*cp) { - cd = *cp; - *cp = cd->next; - } - mutex_unlock(&chrdevs_lock); - return cd; -} - -/** - * register_chrdev_region() - register a range of device numbers - * @from: the first in the desired range of device numbers; must include - * the major number. - * @count: the number of consecutive device numbers required - * @name: the name of the device or driver. - * - * Return value is zero on success, a negative error code on failure. 
- */ -int register_chrdev_region(dev_t from, unsigned count, const char *name) -{ - struct char_device_struct *cd; - dev_t to = from + count; - dev_t n, next; - - for (n = from; n < to; n = next) { - next = MKDEV(MAJOR(n)+1, 0); - if (next > to) - next = to; - cd = __register_chrdev_region(MAJOR(n), MINOR(n), - next - n, name); - if (IS_ERR(cd)) - goto fail; - } - return 0; -fail: - to = n; - for (n = from; n < to; n = next) { - next = MKDEV(MAJOR(n)+1, 0); - kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); - } - return PTR_ERR(cd); -} - -/** - * alloc_chrdev_region() - register a range of char device numbers - * @dev: output parameter for first assigned number - * @baseminor: first of the requested range of minor numbers - * @count: the number of minor numbers required - * @name: the name of the associated device or driver - * - * Allocates a range of char device numbers. The major number will be - * chosen dynamically, and returned (along with the first minor number) - * in @dev. Returns zero or a negative error code. - */ -int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, - const char *name) -{ - struct char_device_struct *cd; - cd = __register_chrdev_region(0, baseminor, count, name); - if (IS_ERR(cd)) - return PTR_ERR(cd); - *dev = MKDEV(cd->major, cd->baseminor); - return 0; -} - -/** - * register_chrdev() - Register a major number for character devices. - * @major: major device number or 0 for dynamic allocation - * @name: name of this range of devices - * @fops: file operations associated with this devices - * - * If @major == 0 this functions will dynamically allocate a major and return - * its number. - * - * If @major > 0 this function will attempt to reserve a device with the given - * major number and will return zero on success. - * - * Returns a -ve errno on failure. - * - * The name of this device has nothing to do with the name of the device in - * /dev. It only helps to keep track of the different owners of devices. If - * your module name has only one type of devices it's ok to use e.g. the name - * of the module here. - * - * This function registers a range of 256 minor numbers. The first minor number - * is 0. - */ -int register_chrdev(unsigned int major, const char *name, - const struct file_operations *fops) -{ - struct char_device_struct *cd; - struct cdev *cdev; - char *s; - int err = -ENOMEM; - - cd = __register_chrdev_region(major, 0, 256, name); - if (IS_ERR(cd)) - return PTR_ERR(cd); - - cdev = cdev_alloc(); - if (!cdev) - goto out2; - - cdev->owner = fops->owner; - cdev->ops = fops; - kobject_set_name(&cdev->kobj, "%s", name); - for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) - *s = '!'; - - err = cdev_add(cdev, MKDEV(cd->major, 0), 256); - if (err) - goto out; - - cd->cdev = cdev; - - return major ? 0 : cd->major; -out: - kobject_put(&cdev->kobj); -out2: - kfree(__unregister_chrdev_region(cd->major, 0, 256)); - return err; -} - -/** - * unregister_chrdev_region() - return a range of device numbers - * @from: the first in the range of numbers to unregister - * @count: the number of device numbers to unregister - * - * This function will unregister a range of @count device numbers, - * starting with @from. The caller should normally be the one who - * allocated those numbers in the first place... 
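register_chrdev() in the hunk above is the legacy one-call interface: it reserves minors 0-255 of a major and wires up a cdev internally, returning the dynamically chosen major when 0 is passed. A hedged sketch of that style under hypothetical names (mydrv_fops, mydrv_major):

#include <linux/module.h>
#include <linux/fs.h>

static int mydrv_major;		/* hypothetical; 0 below requests a dynamic major */

static const struct file_operations mydrv_fops = {
	.owner = THIS_MODULE,
	/* .open/.read/... would go here */
};

static int __init mydrv_init(void)
{
	int ret = register_chrdev(0, "mydrv", &mydrv_fops);

	if (ret < 0)
		return ret;	/* e.g. -EBUSY if no major is free */
	mydrv_major = ret;	/* dynamic major returned on success */
	return 0;
}

static void __exit mydrv_exit(void)
{
	unregister_chrdev(mydrv_major, "mydrv");
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");

As the kernel-doc above notes, the name passed here only identifies the owner in /proc/devices; it has no connection to node names under /dev.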
- */ -void unregister_chrdev_region(dev_t from, unsigned count) -{ - dev_t to = from + count; - dev_t n, next; - - for (n = from; n < to; n = next) { - next = MKDEV(MAJOR(n)+1, 0); - if (next > to) - next = to; - kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); - } -} - -void unregister_chrdev(unsigned int major, const char *name) -{ - struct char_device_struct *cd; - cd = __unregister_chrdev_region(major, 0, 256); - if (cd && cd->cdev) - cdev_del(cd->cdev); - kfree(cd); -} - -static DEFINE_SPINLOCK(cdev_lock); - -static struct kobject *cdev_get(struct cdev *p) -{ - struct module *owner = p->owner; - struct kobject *kobj; - - if (owner && !try_module_get(owner)) - return NULL; - kobj = kobject_get(&p->kobj); - if (!kobj) - module_put(owner); - return kobj; -} - -void cdev_put(struct cdev *p) -{ - if (p) { - struct module *owner = p->owner; - kobject_put(&p->kobj); - module_put(owner); - } -} - -/* - * Called every time a character special file is opened - */ -static int chrdev_open(struct inode *inode, struct file *filp) -{ - struct cdev *p; - struct cdev *new = NULL; - int ret = 0; - - spin_lock(&cdev_lock); - p = inode->i_cdev; - if (!p) { - struct kobject *kobj; - int idx; - spin_unlock(&cdev_lock); - kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); - if (!kobj) - return -ENXIO; - new = container_of(kobj, struct cdev, kobj); - spin_lock(&cdev_lock); - /* Check i_cdev again in case somebody beat us to it while - we dropped the lock. */ - p = inode->i_cdev; - if (!p) { - inode->i_cdev = p = new; - inode->i_cindex = idx; - list_add(&inode->i_devices, &p->list); - new = NULL; - } else if (!cdev_get(p)) - ret = -ENXIO; - } else if (!cdev_get(p)) - ret = -ENXIO; - spin_unlock(&cdev_lock); - cdev_put(new); - if (ret) - return ret; - - ret = -ENXIO; - filp->f_op = fops_get(p->ops); - if (!filp->f_op) - goto out_cdev_put; - - if (filp->f_op->open) { - ret = filp->f_op->open(inode,filp); - if (ret) - goto out_cdev_put; - } - - return 0; - - out_cdev_put: - cdev_put(p); - return ret; -} - -void cd_forget(struct inode *inode) -{ - spin_lock(&cdev_lock); - list_del_init(&inode->i_devices); - inode->i_cdev = NULL; - spin_unlock(&cdev_lock); -} - -static void cdev_purge(struct cdev *cdev) -{ - spin_lock(&cdev_lock); - while (!list_empty(&cdev->list)) { - struct inode *inode; - inode = container_of(cdev->list.next, struct inode, i_devices); - list_del_init(&inode->i_devices); - inode->i_cdev = NULL; - } - spin_unlock(&cdev_lock); -} - -/* - * Dummy default file-operations: the only thing this does - * is contain the open that then fills in the correct operations - * depending on the special file... - */ -const struct file_operations def_chr_fops = { - .open = chrdev_open, -}; - -static struct kobject *exact_match(dev_t dev, int *part, void *data) -{ - struct cdev *p = data; - return &p->kobj; -} - -static int exact_lock(dev_t dev, void *data) -{ - struct cdev *p = data; - return cdev_get(p) ? 0 : -1; -} - -/** - * cdev_add() - add a char device to the system - * @p: the cdev structure for the device - * @dev: the first device number for which this device is responsible - * @count: the number of consecutive minor numbers corresponding to this - * device - * - * cdev_add() adds the device represented by @p to the system, making it - * live immediately. A negative error code is returned on failure. 
- */ -int cdev_add(struct cdev *p, dev_t dev, unsigned count) -{ - p->dev = dev; - p->count = count; - return kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); -} - -static void cdev_unmap(dev_t dev, unsigned count) -{ - kobj_unmap(cdev_map, dev, count); -} - -/** - * cdev_del() - remove a cdev from the system - * @p: the cdev structure to be removed - * - * cdev_del() removes @p from the system, possibly freeing the structure - * itself. - */ -void cdev_del(struct cdev *p) -{ - cdev_unmap(p->dev, p->count); - kobject_put(&p->kobj); -} - - -static void cdev_default_release(struct kobject *kobj) -{ - struct cdev *p = container_of(kobj, struct cdev, kobj); - cdev_purge(p); -} - -static void cdev_dynamic_release(struct kobject *kobj) -{ - struct cdev *p = container_of(kobj, struct cdev, kobj); - cdev_purge(p); - kfree(p); -} - -static struct kobj_type ktype_cdev_default = { - .release = cdev_default_release, -}; - -static struct kobj_type ktype_cdev_dynamic = { - .release = cdev_dynamic_release, -}; - -/** - * cdev_alloc() - allocate a cdev structure - * - * Allocates and returns a cdev structure, or NULL on failure. - */ -struct cdev *cdev_alloc(void) -{ - struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); - if (p) { - INIT_LIST_HEAD(&p->list); - kobject_init(&p->kobj, &ktype_cdev_dynamic); - } - return p; -} - -/** - * cdev_init() - initialize a cdev structure - * @cdev: the structure to initialize - * @fops: the file_operations for this device - * - * Initializes @cdev, remembering @fops, making it ready to add to the - * system with cdev_add(). - */ -void cdev_init(struct cdev *cdev, const struct file_operations *fops) -{ - memset(cdev, 0, sizeof *cdev); - INIT_LIST_HEAD(&cdev->list); - kobject_init(&cdev->kobj, &ktype_cdev_default); - cdev->ops = fops; -} - -static struct kobject *base_probe(dev_t dev, int *part, void *data) -{ - if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("char-major-%d", MAJOR(dev)); - return NULL; -} - -void __init chrdev_init(void) -{ - cdev_map = kobj_map_init(base_probe, &chrdevs_lock); - bdi_init(&directly_mappable_cdev_bdi); -} - -#ifndef LIBINPUT -core_initcall(chrdev_init); -#endif - -/* Let modules do char dev stuff */ -EXPORT_SYMBOL(register_chrdev_region); -EXPORT_SYMBOL(unregister_chrdev_region); -EXPORT_SYMBOL(alloc_chrdev_region); -EXPORT_SYMBOL(cdev_init); -EXPORT_SYMBOL(cdev_alloc); -EXPORT_SYMBOL(cdev_del); -EXPORT_SYMBOL(cdev_add); -EXPORT_SYMBOL(register_chrdev); -EXPORT_SYMBOL(unregister_chrdev); -EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/libdde_linux26/lib/src/kernel/.svn/all-wcprops b/libdde_linux26/lib/src/kernel/.svn/all-wcprops deleted file mode 100644 index 38a4b95e..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/all-wcprops +++ /dev/null @@ -1,77 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 65 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel -END -sys.c -K 25 -svn:wc:ra_dav:version-url -V 71 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/sys.c -END -time.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/time.c -END -timeconst.pl -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/timeconst.pl -END -cred-internals.h -K 25 -svn:wc:ra_dav:version-url -V 82 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/cred-internals.h -END -timer.c -K 25 
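The cdev_init()/cdev_add() path shown above is the newer interface, where the driver owns the struct cdev and the device-number range separately. A hedged sketch under assumed names (mychar_*), pairing alloc_chrdev_region() for the numbers with cdev_del()/unregister_chrdev_region() for teardown:

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/kdev_t.h>

#define MYCHAR_MINORS 4		/* hypothetical minor count */

static dev_t mychar_devt;
static struct cdev mychar_cdev;

static const struct file_operations mychar_fops = {
	.owner = THIS_MODULE,
};

static int __init mychar_init(void)
{
	int ret = alloc_chrdev_region(&mychar_devt, 0, MYCHAR_MINORS, "mychar");

	if (ret)
		return ret;

	cdev_init(&mychar_cdev, &mychar_fops);
	mychar_cdev.owner = THIS_MODULE;

	/* The device is live as soon as cdev_add() succeeds. */
	ret = cdev_add(&mychar_cdev, mychar_devt, MYCHAR_MINORS);
	if (ret)
		unregister_chrdev_region(mychar_devt, MYCHAR_MINORS);
	return ret;
}

static void __exit mychar_exit(void)
{
	cdev_del(&mychar_cdev);
	unregister_chrdev_region(mychar_devt, MYCHAR_MINORS);
}

module_init(mychar_init);
module_exit(mychar_exit);
MODULE_LICENSE("GPL");

When the cdev cannot be embedded in a driver structure, cdev_alloc() is the alternative: it installs ktype_cdev_dynamic, so the final kobject_put() from cdev_del() frees the structure automatically.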
-svn:wc:ra_dav:version-url -V 73 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/timer.c -END -capability.c -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/capability.c -END -wait.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/wait.c -END -sched.c -K 25 -svn:wc:ra_dav:version-url -V 73 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/sched.c -END -resource.c -K 25 -svn:wc:ra_dav:version-url -V 76 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/resource.c -END -workqueue.c -K 25 -svn:wc:ra_dav:version-url -V 77 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/workqueue.c -END -exit.c -K 25 -svn:wc:ra_dav:version-url -V 72 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/exit.c -END -sched_cpupri.h -K 25 -svn:wc:ra_dav:version-url -V 80 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/kernel/sched_cpupri.h -END diff --git a/libdde_linux26/lib/src/kernel/.svn/entries b/libdde_linux26/lib/src/kernel/.svn/entries deleted file mode 100644 index 60cbc088..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/entries +++ /dev/null @@ -1,436 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/kernel -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -sys.c -file - - - - -2009-11-15T17:17:07.000000Z -df3ddb7c9aa610d5c7d089ceb7991677 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -43082 - -time.c -file - - - - -2009-11-15T17:17:07.000000Z -eba8d029c2efda8fc995335e9fd5f641 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -19850 - -timeconst.pl -file - - - - -2009-11-15T17:17:07.000000Z -2a9c2def11c2f688e37e08bbbeb6ffe4 -2009-05-20T14:32:55.606606Z -455 -l4check -has-props - - - - - - - - - - - - - - - - - - - - -7425 - -cred-internals.h -file - - - - -2009-11-15T17:17:07.000000Z -b93463f99c8458afb558997a4676812b -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -559 - -timer.c -file - - - - -2009-11-15T17:17:07.000000Z -e4ad2e7b9720b26088054d0ecd4fba26 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -42197 - -capability.c -file - - - - -2009-11-15T17:17:07.000000Z -035383b42b24ee2860d993343f8b2487 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -8420 - -wait.c -file - - - - -2009-11-15T17:17:07.000000Z -b18464408b6aebcb02337450e82df60d -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -8489 - -sched.c -file - - - - -2009-11-15T17:17:07.000000Z -2545ed631dc66c14557dd29de095935d -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -238600 - -resource.c -file - - - - -2009-11-15T17:17:07.000000Z -07f9767ab2ab4a7e342c81ba4480fe52 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -21910 - -workqueue.c -file - - - - -2009-11-15T17:17:07.000000Z -97ec27cdf954cbe9c988a8fd1b6f9650 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -25841 - -exit.c -file - - - - -2009-11-15T17:17:07.000000Z -44f244367a422cd8a0662b1e2b56cf4f -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - 
-47038 - -sched_cpupri.h -file - - - - -2009-11-15T17:17:07.000000Z -db7408294ba0998b6919912799859f7c -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -936 - diff --git a/libdde_linux26/lib/src/kernel/.svn/format b/libdde_linux26/lib/src/kernel/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/kernel/.svn/prop-base/timeconst.pl.svn-base b/libdde_linux26/lib/src/kernel/.svn/prop-base/timeconst.pl.svn-base deleted file mode 100644 index 869ac71c..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/prop-base/timeconst.pl.svn-base +++ /dev/null @@ -1,5 +0,0 @@ -K 14 -svn:executable -V 1 -* -END diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/capability.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/capability.c.svn-base deleted file mode 100644 index c269aa7c..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/capability.c.svn-base +++ /dev/null @@ -1,323 +0,0 @@ -/* - * linux/kernel/capability.c - * - * Copyright (C) 1997 Andrew Main <zefram@fysh.org> - * - * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> - * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> - */ - -#include <linux/audit.h> -#include <linux/capability.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/pid_namespace.h> -#include <asm/uaccess.h> -#include "cred-internals.h" - -#ifndef DDE_LINUX -/* - * This lock protects task->cap_* for all tasks including current. - * Locking rule: acquire this prior to tasklist_lock. - */ -static DEFINE_SPINLOCK(task_capability_lock); - -/* - * Leveraged for setting/resetting capabilities - */ - -const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -const kernel_cap_t __cap_full_set = CAP_FULL_SET; -const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; - -EXPORT_SYMBOL(__cap_empty_set); -EXPORT_SYMBOL(__cap_full_set); -EXPORT_SYMBOL(__cap_init_eff_set); - -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES -int file_caps_enabled = 1; - -static int __init file_caps_disable(char *str) -{ - file_caps_enabled = 0; - return 1; -} -__setup("no_file_caps", file_caps_disable); -#endif - -/* - * More recent versions of libcap are available from: - * - * http://www.kernel.org/pub/linux/libs/security/linux-privs/ - */ - -static void warn_legacy_capability_use(void) -{ - static int warned; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" - " (legacy support in use)\n", - get_task_comm(name, current)); - warned = 1; - } -} - -/* - * Version 2 capabilities worked fine, but the linux/capability.h file - * that accompanied their introduction encouraged their use without - * the necessary user-space source code changes. As such, we have - * created a version 3 with equivalent functionality to version 2, but - * with a header change to protect legacy source code from using - * version 2 when it wanted to use version 1. If your system has code - * that trips the following warning, it is using version 2 specific - * capabilities and may be doing so insecurely. - * - * The remedy is to either upgrade your version of libcap (to 2.10+, - * if the application is linked against it), or recompile your - * application with modern kernel headers and this warning will go - * away. 
- */ - -static void warn_deprecated_v2(void) -{ - static int warned; - - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses deprecated v2" - " capabilities in a way that may be insecure.\n", - get_task_comm(name, current)); - warned = 1; - } -} - -/* - * Version check. Return the number of u32s in each capability flag - * array, or a negative value on error. - */ -static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) -{ - __u32 version; - - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - *tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - warn_deprecated_v2(); - /* - * fall through - v3 is otherwise equivalent to v2. - */ - case _LINUX_CAPABILITY_VERSION_3: - *tocopy = _LINUX_CAPABILITY_U32S_3; - break; - default: - if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } - - return 0; -} - -/* - * The only thing that can change the capabilities of the current - * process is the current process. As such, we can't be in this code - * at the same time as we are in the process of setting capabilities - * in this process. The net result is that we can limit our use of - * locks to when we are reading the caps of another process. - */ -static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, - kernel_cap_t *pIp, kernel_cap_t *pPp) -{ - int ret; - - if (pid && (pid != task_pid_vnr(current))) { - struct task_struct *target; - - read_lock(&tasklist_lock); - - target = find_task_by_vpid(pid); - if (!target) - ret = -ESRCH; - else - ret = security_capget(target, pEp, pIp, pPp); - - read_unlock(&tasklist_lock); - } else - ret = security_capget(current, pEp, pIp, pPp); - - return ret; -} - -/** - * sys_capget - get the capabilities of a given process. - * @header: pointer to struct that contains capability version and - * target pid data - * @dataptr: pointer to struct that contains the effective, permitted, - * and inheritable capabilities that are returned - * - * Returns 0 on success and < 0 on error. - */ -SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) -{ - int ret = 0; - pid_t pid; - unsigned tocopy; - kernel_cap_t pE, pI, pP; - - ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; - - if (get_user(pid, &header->pid)) - return -EFAULT; - - if (pid < 0) - return -EINVAL; - - ret = cap_get_target_pid(pid, &pE, &pI, &pP); - if (!ret) { - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i; - - for (i = 0; i < tocopy; i++) { - kdata[i].effective = pE.cap[i]; - kdata[i].permitted = pP.cap[i]; - kdata[i].inheritable = pI.cap[i]; - } - - /* - * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, - * we silently drop the upper capabilities here. This - * has the effect of making older libcap - * implementations implicitly drop upper capability - * bits when they perform a: capget/modify/capset - * sequence. - * - * This behavior is considered fail-safe - * behavior. Upgrading the application to a newer - * version of libcap will enable access to the newer - * capabilities. - * - * An alternative would be to return an error here - * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts - * before modification is attempted and the application - * fails. 
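sys_capget() above negotiates the header version: an unrecognised version gets the kernel's preferred value written back into the header along with -EINVAL, and legacy 32-bit (v1) callers trigger the warning printk. A hedged user-space sketch of the raw syscall, assuming kernel headers that define _LINUX_CAPABILITY_VERSION_3; real programs normally use libcap instead, and everything beyond the uapi struct layout is illustrative:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
	struct __user_cap_header_struct hdr = {
		.version = _LINUX_CAPABILITY_VERSION_3,
		.pid = 0,			/* 0 means the calling process */
	};
	struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];

	if (syscall(SYS_capget, &hdr, data) != 0) {
		/* On a version mismatch the kernel rewrites hdr.version and fails. */
		perror("capget");
		return 1;
	}
	printf("effective[0]=0x%x permitted[0]=0x%x\n",
	       data[0].effective, data[0].permitted);
	return 0;
}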
- */ - if (copy_to_user(dataptr, kdata, tocopy - * sizeof(struct __user_cap_data_struct))) { - return -EFAULT; - } - } - - return ret; -} - -/** - * sys_capset - set capabilities for a process or (*) a group of processes - * @header: pointer to struct that contains capability version and - * target pid data - * @data: pointer to struct that contains the effective, permitted, - * and inheritable capabilities - * - * Set capabilities for the current process only. The ability to any other - * process(es) has been deprecated and removed. - * - * The restrictions on setting capabilities are specified as: - * - * I: any raised capabilities must be a subset of the old permitted - * P: any raised capabilities must be a subset of the old permitted - * E: must be set to a subset of new permitted - * - * Returns 0 on success and < 0 on error. - */ -SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) -{ - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy; - kernel_cap_t inheritable, permitted, effective; - struct cred *new; - int ret; - pid_t pid; - - ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; - - if (get_user(pid, &header->pid)) - return -EFAULT; - - /* may only affect current now */ - if (pid != 0 && pid != task_pid_vnr(current)) - return -EPERM; - - if (copy_from_user(&kdata, data, - tocopy * sizeof(struct __user_cap_data_struct))) - return -EFAULT; - - for (i = 0; i < tocopy; i++) { - effective.cap[i] = kdata[i].effective; - permitted.cap[i] = kdata[i].permitted; - inheritable.cap[i] = kdata[i].inheritable; - } - while (i < _KERNEL_CAPABILITY_U32S) { - effective.cap[i] = 0; - permitted.cap[i] = 0; - inheritable.cap[i] = 0; - i++; - } - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - ret = security_capset(new, current_cred(), - &effective, &inheritable, &permitted); - if (ret < 0) - goto error; - - audit_log_capset(pid, new, current_cred()); - - return commit_creds(new); - -error: - abort_creds(new); - return ret; -} -#endif /* !DDE_LINUX */ - -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -int capable(int cap) -{ - if (unlikely(!cap_valid(cap))) { - printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); - BUG(); - } - - if (security_capable(cap) == 0) { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; -} -EXPORT_SYMBOL(capable); diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/cred-internals.h.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/cred-internals.h.svn-base deleted file mode 100644 index 2dc4fc2d..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/cred-internals.h.svn-base +++ /dev/null @@ -1,21 +0,0 @@ -/* Internal credentials stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
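capable() at the end of the hunk above is the in-kernel privilege check; it also sets PF_SUPERPRIV on the task on the assumption the privilege is about to be used. A hedged sketch of how driver code commonly gates a privileged operation; mydrv_reset() is hypothetical:

#include <linux/capability.h>
#include <linux/errno.h>

/* Hypothetical privileged path, e.g. reached from a driver ioctl. */
static int mydrv_reset(void)
{
	if (!capable(CAP_SYS_ADMIN))	/* only privileged callers may reset */
		return -EPERM;

	/* ... perform the privileged reset here ... */
	return 0;
}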
- */ - -/* - * user.c - */ -static inline void sched_switch_user(struct task_struct *p) -{ -#ifdef CONFIG_USER_SCHED - sched_move_task(p); -#endif /* CONFIG_USER_SCHED */ -} - diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/exit.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/exit.c.svn-base deleted file mode 100644 index 703f9aab..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/exit.c.svn-base +++ /dev/null @@ -1,1850 +0,0 @@ -/* - * linux/kernel/exit.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/capability.h> -#include <linux/completion.h> -#include <linux/personality.h> -#include <linux/tty.h> -#include <linux/mnt_namespace.h> -#include <linux/iocontext.h> -#include <linux/key.h> -#include <linux/security.h> -#include <linux/cpu.h> -#include <linux/acct.h> -#include <linux/tsacct_kern.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/binfmts.h> -#include <linux/nsproxy.h> -#include <linux/pid_namespace.h> -#include <linux/ptrace.h> -#include <linux/profile.h> -#include <linux/mount.h> -#include <linux/proc_fs.h> -#include <linux/kthread.h> -#include <linux/mempolicy.h> -#include <linux/taskstats_kern.h> -#include <linux/delayacct.h> -#include <linux/freezer.h> -#include <linux/cgroup.h> -#include <linux/syscalls.h> -#include <linux/signal.h> -#include <linux/posix-timers.h> -#include <linux/cn_proc.h> -#include <linux/mutex.h> -#include <linux/futex.h> -#include <linux/pipe_fs_i.h> -#include <linux/audit.h> /* for audit_free() */ -#include <linux/resource.h> -#include <linux/blkdev.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/tracehook.h> -#include <linux/init_task.h> -#include <trace/sched.h> - -#include <asm/uaccess.h> -#include <asm/unistd.h> -#include <asm/pgtable.h> -#include <asm/mmu_context.h> -#include "cred-internals.h" - -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - -#ifndef DDE_LINUX -static void exit_mm(struct task_struct * tsk); - -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} - -static void __unhash_process(struct task_struct *p) -{ - nr_threads--; - detach_pid(p, PIDTYPE_PID); - if (thread_group_leader(p)) { - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); - - list_del_rcu(&p->tasks); - __get_cpu_var(process_counts)--; - } - list_del_rcu(&p->thread_group); - list_del_init(&p->sibling); -} - -/* - * This function expects the tasklist_lock write-locked. - */ -static void __exit_signal(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct sighand_struct *sighand; - - BUG_ON(!sig); - BUG_ON(!atomic_read(&sig->count)); - - sighand = rcu_dereference(tsk->sighand); - spin_lock(&sighand->siglock); - - posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) - posix_cpu_timers_exit_group(tsk); - else { - /* - * If there is any task waiting for the group exit - * then notify it: - */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) - wake_up_process(sig->group_exit_task); - - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. 
When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); - sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; - sig = NULL; /* Marker for below. */ - } - - __unhash_process(tsk); - - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); - - tsk->signal = NULL; - tsk->sighand = NULL; - spin_unlock(&sighand->siglock); - - __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (sig) { - flush_sigqueue(&sig->shared_pending); - taskstats_tgid_free(sig); - /* - * Make sure ->signal can't go away under rq->lock, - * see account_group_exec_runtime(). - */ - task_rq_unlock_wait(tsk); - __cleanup_signal(sig); - } -} - -static void delayed_put_task_struct(struct rcu_head *rhp) -{ - struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); - - trace_sched_process_free(tsk); - put_task_struct(tsk); -} - - -void release_task(struct task_struct * p) -{ - struct task_struct *leader; - int zap_leader; -repeat: - tracehook_prepare_release_task(p); - /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials */ - atomic_dec(&__task_cred(p)->user->processes); - - proc_flush_task(p); - write_lock_irq(&tasklist_lock); - tracehook_finish_release_task(p); - __exit_signal(p); - - /* - * If we are the last non-leader member of the thread - * group, and the leader is zombie, then notify the - * group leader's parent process. (if it wants notification.) - */ - zap_leader = 0; - leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(task_detached(leader)); - do_notify_parent(leader, leader->exit_signal); - /* - * If we were the last child thread and the leader has - * exited already, and the leader's parent ignores SIGCHLD, - * then we are the one who should release the leader. - * - * do_notify_parent() will have marked it self-reaping in - * that case. - */ - zap_leader = task_detached(leader); - - /* - * This maintains the invariant that release_task() - * only runs on a task in EXIT_DEAD, just for sanity. - */ - if (zap_leader) - leader->exit_state = EXIT_DEAD; - } - - write_unlock_irq(&tasklist_lock); - release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); - - p = leader; - if (unlikely(zap_leader)) - goto repeat; -} - -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. 
- */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* - * Determine if a process group is "orphaned", according to the POSIX - * definition in 2.2.2.52. Orphaned process groups are not to be affected - * by terminal-generated stop signals. Newly orphaned process groups are - * to receive a SIGHUP and a SIGCONT. - * - * "I ask you, have you ever known what it is to be an orphan?" - */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) -{ - struct task_struct *p; - - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if ((p == ignored_task) || - (p->exit_state && thread_group_empty(p)) || - is_global_init(p->real_parent)) - continue; - - if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) - return 0; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - - return 1; -} - -int is_current_pgrp_orphaned(void) -{ - int retval; - - read_lock(&tasklist_lock); - retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); - read_unlock(&tasklist_lock); - - return retval; -} - -static int has_stopped_jobs(struct pid *pgrp) -{ - int retval = 0; - struct task_struct *p; - - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (!task_is_stopped(p)) - continue; - retval = 1; - break; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return retval; -} - -/* - * Check to see if any process groups have become orphaned as - * a result of our exiting, and if they have any stopped jobs, - * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ -static void -kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) -{ - struct pid *pgrp = task_pgrp(tsk); - struct task_struct *ignored_task = tsk; - - if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ - parent = tsk->real_parent; - else - /* reparent: our child is in a different pgrp than - * we are, and it was the only connection outside. - */ - ignored_task = NULL; - - if (task_pgrp(parent) != pgrp && - task_session(parent) == task_session(tsk) && - will_become_orphaned_pgrp(pgrp, ignored_task) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } -} - -/** - * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to kthreadd so it - * isn't in the way of other processes and is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited from a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_kthreadd() gives the caller full capabilities. - */ -static void reparent_to_kthreadd(void) -{ - write_lock_irq(&tasklist_lock); - - ptrace_unlink(current); - /* Reparent to init */ - current->real_parent = current->parent = kthreadd_task; - list_move_tail(¤t->sibling, ¤t->real_parent->children); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - - if (task_nice(current) < 0) - set_user_nice(current, 0); - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? 
*/ - memcpy(current->signal->rlim, init_task.signal->rlim, - sizeof(current->signal->rlim)); - -#ifndef DDE_LINUX - atomic_inc(&init_cred.usage); - commit_creds(&init_cred); -#endif - write_unlock_irq(&tasklist_lock); -} - -void __set_special_pids(struct pid *pid) -{ - struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - - if (task_session(curr) != pid) { - change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { - change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } -} - -static void set_special_pids(struct pid *pid) -{ - write_lock_irq(&tasklist_lock); - __set_special_pids(pid); - write_unlock_irq(&tasklist_lock); -} - -/* - * Let kernel threads use this to say that they - * allow a certain signal (since daemonize() will - * have disabled all of them by default). - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - sigdelset(¤t->blocked, sig); - if (!current->mm) { - /* Kernel threads handle their own signals. - Let the signal code know it'll be handled, so - that they don't get converted to SIGKILL or - just silently dropped */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(const char *name, ...) -{ - va_list args; - struct fs_struct *fs; - sigset_t blocked; - - va_start(args, name); - vsnprintf(current->comm, sizeof(current->comm), name, args); - va_end(args); - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. - */ - exit_mm(current); - /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation - * or suspend transition begins right now. - */ - current->flags |= (PF_NOFREEZE | PF_KTHREAD); - - if (current->nsproxy != &init_nsproxy) { - get_nsproxy(&init_nsproxy); - switch_task_namespaces(current, &init_nsproxy); - } - set_special_pids(&init_struct_pid); - proc_clear_tty(current); - - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* Become as one with the init task */ - - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); - - reparent_to_kthreadd(); -} - -EXPORT_SYMBOL(daemonize); - -static void close_files(struct files_struct * files) -{ - int i, j; - struct fdtable *fdt; - - j = 0; - - /* - * It is safe to dereference the fd table without RCU or - * ->file_lock because this is the last reference to the - * files structure. 
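daemonize() and allow_signal() in the hunk above are how 2.6-era kernel threads shed their inherited user-space resources (reparenting to kthreadd, blocking all signals) and then opted back in to the few signals they care about; later kernels replace this with the kthread API. A hedged sketch of that classic pattern, with hypothetical names and loop body:

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>

static int mythread(void *unused)
{
	/* Drop user-space resources and reparent to kthreadd. */
	daemonize("mythread");
	/* daemonize() blocks everything; re-enable the one signal we handle. */
	allow_signal(SIGTERM);

	while (!signal_pending(current))
		msleep_interruptible(1000);	/* stand-in for periodic work */

	return 0;
}

static int __init mythread_start(void)
{
	pid_t pid = kernel_thread(mythread, NULL, CLONE_FS | CLONE_FILES);

	return pid < 0 ? pid : 0;
}
module_init(mythread_start);
MODULE_LICENSE("GPL");

A real module must also stop the thread and wait for it before unloading; the usual tool for that is a completion plus complete_and_exit(), which appears further down in exit.c.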
- */ - fdt = files_fdtable(files); - for (;;) { - unsigned long set; - i = j * __NFDBITS; - if (i >= fdt->max_fds) - break; - set = fdt->open_fds->fds_bits[j++]; - while (set) { - if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); - if (file) { - filp_close(file, files); - cond_resched(); - } - } - i++; - set >>= 1; - } - } -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - -void put_files_struct(struct files_struct *files) -{ - struct fdtable *fdt; - - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* - * Free the fd and fdset arrays if we expanded them. - * If the fdtable was embedded, pass files for freeing - * at the end of the RCU grace period. Otherwise, - * you can free files immediately. - */ - fdt = files_fdtable(files); - if (fdt != &files->fdtab) - kmem_cache_free(files_cachep, files); - free_fdtable(fdt); - } -} - -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ - struct files_struct * files = tsk->files; - - if (files) { - task_lock(tsk); - tsk->files = NULL; - task_unlock(tsk); - put_files_struct(files); - } -} - -void put_fs_struct(struct fs_struct *fs) -{ - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } -} - -void exit_fs(struct task_struct *tsk) -{ - struct fs_struct * fs = tsk->fs; - - if (fs) { - task_lock(tsk); - tsk->fs = NULL; - task_unlock(tsk); - put_fs_struct(fs); - } -} - -EXPORT_SYMBOL_GPL(exit_fs); - -#ifdef CONFIG_MM_OWNER -/* - * Task p is exiting and it owned mm, lets find a new owner for it - */ -static inline int -mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) -{ - /* - * If there are other users of the mm and the owner (us) is exiting - * we need to find a new owner to take on the responsibility. - */ - if (atomic_read(&mm->mm_users) <= 1) - return 0; - if (mm->owner != p) - return 0; - return 1; -} - -void mm_update_next_owner(struct mm_struct *mm) -{ - struct task_struct *c, *g, *p = current; - -retry: - if (!mm_need_new_owner(mm, p)) - return; - - read_lock(&tasklist_lock); - /* - * Search in the children - */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; - } - - /* - * Search in the siblings - */ - list_for_each_entry(c, &p->parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; - } - - /* - * Search through everything else. We should not get - * here often - */ - do_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - } while_each_thread(g, c); - - read_unlock(&tasklist_lock); - /* - * We found no owner yet mm_users > 1: this implies that we are - * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL. - */ - mm->owner = NULL; - return; - -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. 
- * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - mm->owner = c; - task_unlock(c); - put_task_struct(c); -} -#endif /* CONFIG_MM_OWNER */ - -/* - * Turn us into a lazy TLB process if we - * aren't already.. - */ -static void exit_mm(struct task_struct * tsk) -{ - struct mm_struct *mm = tsk->mm; - struct core_state *core_state; - - mm_release(tsk, mm); - if (!mm) - return; - /* - * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state - * and clearing tsk->mm. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. - */ - down_read(&mm->mmap_sem); - core_state = mm->core_state; - if (core_state) { - struct core_thread self; - up_read(&mm->mmap_sem); - - self.task = tsk; - self.next = xchg(&core_state->dumper.next, &self); - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); - - for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_task_state(tsk, TASK_RUNNING); - down_read(&mm->mmap_sem); - } - atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); - /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; - up_read(&mm->mmap_sem); - enter_lazy_tlb(mm, current); - /* We don't want this task to be frozen prematurely */ - clear_freeze_flag(tsk); - task_unlock(tsk); - mm_update_next_owner(mm); - mmput(mm); -} - -/* - * Return nonzero if @parent's children should reap themselves. - * - * Called with write_lock_irq(&tasklist_lock) held. - */ -static int ignoring_children(struct task_struct *parent) -{ - int ret; - struct sighand_struct *psig = parent->sighand; - unsigned long flags; - spin_lock_irqsave(&psig->siglock, flags); - ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || - (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); - spin_unlock_irqrestore(&psig->siglock, flags); - return ret; -} - -/* - * Detach all tasks we were using ptrace on. - * Any that need to be release_task'd are put on the @dead list. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_exit(struct task_struct *parent, struct list_head *dead) -{ - struct task_struct *p, *n; - int ign = -1; - - list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - continue; - - /* - * If it's a zombie, our attachedness prevented normal - * parent notification or self-reaping. Do notification - * now if it would have happened earlier. If it should - * reap itself, add it to the @dead list. We can't call - * release_task() here because we already hold tasklist_lock. - * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. - */ - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, parent)) - do_notify_parent(p, p->exit_signal); - else { - if (ign < 0) - ign = ignoring_children(parent); - if (ign) - p->exit_signal = -1; - } - } - - if (task_detached(p)) { - /* - * Mark it as in the process of being reaped. 
- */ - p->exit_state = EXIT_DEAD; - list_add(&p->ptrace_entry, dead); - } - } -} - -/* - * Finish up exit-time ptrace cleanup. - * - * Called without locks. - */ -static void ptrace_exit_finish(struct task_struct *parent, - struct list_head *dead) -{ - struct task_struct *p, *n; - - BUG_ON(!list_empty(&parent->ptraced)); - - list_for_each_entry_safe(p, n, dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } -} - -static void reparent_thread(struct task_struct *p, struct task_struct *father) -{ - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - - list_move_tail(&p->sibling, &p->real_parent->children); - - /* If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) - return; - - /* We don't want people slaying init. */ - if (!task_detached(p)) - p->exit_signal = SIGCHLD; - - /* If we'd notified the old parent about this child's death, - * also notify the new parent. - */ - if (!ptrace_reparented(p) && - p->exit_state == EXIT_ZOMBIE && - !task_detached(p) && thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - - kill_orphaned_pgrp(p, father); -} - -/* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. - */ -static struct task_struct *find_new_reaper(struct task_struct *father) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; - - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; - return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; - } - - return pid_ns->child_reaper; -} - -static void forget_original_parent(struct task_struct *father) -{ - struct task_struct *p, *n, *reaper; - LIST_HEAD(ptrace_dead); - - write_lock_irq(&tasklist_lock); - reaper = find_new_reaper(father); - /* - * First clean up ptrace if we were using it. - */ - ptrace_exit(father, &ptrace_dead); - - list_for_each_entry_safe(p, n, &father->children, sibling) { - p->real_parent = reaper; - if (p->parent == father) { - BUG_ON(p->ptrace); - p->parent = p->real_parent; - } - reparent_thread(p, father); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&father->children)); - - ptrace_exit_finish(father, &ptrace_dead); -} - -/* - * Send signals to all our closest relatives so that they know - * to properly mourn us.. - */ -static void exit_notify(struct task_struct *tsk, int group_dead) -{ - int signal; - void *cookie; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) - */ - forget_original_parent(tsk); - exit_task_namespaces(tsk); - - write_lock_irq(&tasklist_lock); - if (group_dead) - kill_orphaned_pgrp(tsk->group_leader, NULL); - - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. - * That stops right now. - * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. - */ - if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) && - !capable(CAP_KILL)) - tsk->exit_signal = SIGCHLD; - - signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal >= 0) - signal = do_notify_parent(tsk, signal); - - tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; - - /* mt-exec, de_thread() is waiting for us */ - if (thread_group_leader(tsk) && - tsk->signal->group_exit_task && - tsk->signal->notify_count < 0) - wake_up_process(tsk->signal->group_exit_task); - - write_unlock_irq(&tasklist_lock); - - tracehook_report_death(tsk, signal, cookie, group_dead); - - /* If the process is dead, release it - nobody will wait for it */ - if (signal == DEATH_REAP) - release_task(tsk); -} - -#ifdef CONFIG_DEBUG_STACK_USAGE -static void check_stack_usage(void) -{ - static DEFINE_SPINLOCK(low_water_lock); - static int lowest_to_date = THREAD_SIZE; - unsigned long *n = end_of_stack(current); - unsigned long free; - - while (*n == 0) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(current); - - if (free >= lowest_to_date) - return; - - spin_lock(&low_water_lock); - if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); - lowest_to_date = free; - } - spin_unlock(&low_water_lock); -} -#else -static inline void check_stack_usage(void) {} -#endif - -NORET_TYPE void do_exit(long code) -{ - struct task_struct *tsk = current; - int group_dead; - - profile_task_exit(tsk); - - WARN_ON(atomic_read(&tsk->fs_excl)); - - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - - tracehook_report_exit(&code); - - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - - exit_signals(tsk); /* sets PF_EXITING */ - /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. 
- */ - smp_mb(); - spin_unlock_wait(&tsk->pi_lock); - - if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - - acct_update_integrals(tsk); - - group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) { - hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); - } - acct_collect(code, group_dead); - if (group_dead) - tty_audit_exit(); - if (unlikely(tsk->audit_context)) - audit_free(tsk); - - tsk->exit_code = code; - taskstats_exit(tsk, group_dead); - - exit_mm(tsk); - - if (group_dead) - acct_process(); - trace_sched_process_exit(tsk); - - exit_sem(tsk); - exit_files(tsk); - exit_fs(tsk); - check_stack_usage(); - exit_thread(); - cgroup_exit(tsk, 1); - - if (group_dead && tsk->signal->leader) - disassociate_ctty(1); - - module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); - - proc_exit_connector(tsk); - exit_notify(tsk, group_dead); -#ifdef CONFIG_NUMA - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; -#endif -#ifdef CONFIG_FUTEX - /* - * This must happen late, after the PID is not - * hashed anymore: - */ - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); - if (unlikely(current->pi_state_cache)) - kfree(current->pi_state_cache); -#endif - /* - * Make sure we are holding no locks: - */ - debug_check_no_locks_held(tsk); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. - */ - tsk->flags |= PF_EXITPIDONE; - - if (tsk->io_context) - exit_io_context(); - - if (tsk->splice_pipe) - __free_pipe_info(tsk->splice_pipe); - - preempt_disable(); - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ -} - -EXPORT_SYMBOL_GPL(do_exit); - -#endif /* !DDE_LINUX */ - -NORET_TYPE void complete_and_exit(struct completion *comp, long code) -{ - if (comp) - complete(comp); - - do_exit(code); -} - -EXPORT_SYMBOL(complete_and_exit); - -#ifndef DDE_LINUX -SYSCALL_DEFINE1(exit, int, error_code) -{ - do_exit((error_code&0xff)<<8); -} - -/* - * Take down every thread in the group. This is called by fatal signals - * as well as by sys_exit_group (below). - */ -NORET_TYPE void -do_group_exit(int exit_code) -{ - struct signal_struct *sig = current->signal; - - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) - exit_code = sig->group_exit_code; - else if (!thread_group_empty(current)) { - struct sighand_struct *const sighand = current->sighand; - spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) - /* Another thread got here before we took the lock. */ - exit_code = sig->group_exit_code; - else { - sig->group_exit_code = exit_code; - sig->flags = SIGNAL_GROUP_EXIT; - zap_other_threads(current); - } - spin_unlock_irq(&sighand->siglock); - } - - do_exit(exit_code); - /* NOTREACHED */ -} - -/* - * this kills every thread in the thread group. Note that any externally - * wait4()-ing process will get the correct exit code - even if this - * thread is not the thread group leader. 
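complete_and_exit() above lets a kernel thread signal a completion and terminate in one non-returning call, which is what makes it safe against module unload: the module's exit routine waits on the completion, so the thread never runs module text that has already been freed. A hedged sketch of that pattern; the names and the plain stop flag are illustrative (a real driver would use the kthread API or proper synchronization):

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/completion.h>
#include <linux/delay.h>

static DECLARE_COMPLETION(worker_done);
static int worker_stop;			/* hypothetical stop flag */

static int worker(void *unused)
{
	daemonize("worker");

	while (!worker_stop)
		msleep_interruptible(100);

	/* Wake the waiter and exit without returning into module code. */
	complete_and_exit(&worker_done, 0);
	return 0;			/* not reached */
}

static int __init worker_init(void)
{
	pid_t pid = kernel_thread(worker, NULL, CLONE_FS | CLONE_FILES);

	return pid < 0 ? pid : 0;
}

static void __exit worker_exit(void)
{
	worker_stop = 1;
	wait_for_completion(&worker_done);	/* don't unload while worker runs */
}

module_init(worker_init);
module_exit(worker_exit);
MODULE_LICENSE("GPL");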
- */ -SYSCALL_DEFINE1(exit_group, int, error_code) -{ - do_group_exit((error_code & 0xff) << 8); - /* NOTREACHED */ - return 0; -} - -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; -} - -static int eligible_child(enum pid_type type, struct pid *pid, int options, - struct task_struct *p) -{ - int err; - - if (type < PIDTYPE_MAX) { - if (task_pid_type(p, type) != pid) - return 0; - } - - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) - && !(options & __WALL)) - return 0; - - err = security_task_wait(p); - if (err) - return err; - - return 1; -} - -static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, - int why, int status, - struct siginfo __user *infop, - struct rusage __user *rusagep) -{ - int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; - - put_task_struct(p); - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); - if (!retval) - retval = pid; - return retval; -} - -/* - * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. - */ -static int wait_task_zombie(struct task_struct *p, int options, - struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - unsigned long state; - int retval, status, traced; - pid_t pid = task_pid_vnr(p); - uid_t uid = __task_cred(p)->uid; - - if (!likely(options & WEXITED)) - return 0; - - if (unlikely(options & WNOWAIT)) { - int exit_code = p->exit_code; - int why, status; - - get_task_struct(p); - read_unlock(&tasklist_lock); - if ((exit_code & 0x7f) == 0) { - why = CLD_EXITED; - status = exit_code >> 8; - } else { - why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - status = exit_code & 0x7f; - } - return wait_noreap_copyout(p, pid, uid, why, - status, infop, ru); - } - - /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: - */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); - return 0; - } - - traced = ptrace_reparented(p); - - if (likely(!traced)) { - struct signal_struct *psig; - struct signal_struct *sig; - struct task_cputime cputime; - - /* - * The resource counters for the group leader are in its - * own task_struct. Those for dead threads in the group - * are in its signal_struct, as are those for the child - * processes it has previously reaped. All these - * accumulate in the parent's signal_struct c* fields. 
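wait_noreap_copyout() above serves the WNOWAIT path: the child's status is copied out but the task stays in EXIT_ZOMBIE, so it can still be reaped later. A small userspace sketch of that behaviour, assuming only the POSIX waitid() interface:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		exit(7);

	siginfo_t info;

	/* Peek at the zombie without reaping it (the WNOWAIT branch above). */
	if (waitid(P_PID, pid, &info, WEXITED | WNOWAIT) == 0)
		printf("peek: %s, status %d\n",
		       info.si_code == CLD_EXITED ? "CLD_EXITED" : "other",
		       info.si_status);

	/* The child is still a zombie, so a second wait succeeds and reaps it. */
	if (waitid(P_PID, pid, &info, WEXITED) == 0)
		printf("reaped pid %d, status %d\n", (int)info.si_pid, info.si_status);

	return 0;
}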
- * - * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to p->parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. - * - * We use thread_group_cputime() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. - */ - thread_group_cputime(p, &cputime); - spin_lock_irq(&p->parent->sighand->siglock); - psig = p->parent->signal; - sig = p->signal; - psig->cutime = - cputime_add(psig->cutime, - cputime_add(cputime.utime, - sig->cutime)); - psig->cstime = - cputime_add(psig->cstime, - cputime_add(cputime.stime, - sig->cstime)); - psig->cgtime = - cputime_add(psig->cgtime, - cputime_add(p->gtime, - cputime_add(sig->gtime, - sig->cgtime))); - psig->cmin_flt += - p->min_flt + sig->min_flt + sig->cmin_flt; - psig->cmaj_flt += - p->maj_flt + sig->maj_flt + sig->cmaj_flt; - psig->cnvcsw += - p->nvcsw + sig->nvcsw + sig->cnvcsw; - psig->cnivcsw += - p->nivcsw + sig->nivcsw + sig->cnivcsw; - psig->cinblock += - task_io_get_inblock(p) + - sig->inblock + sig->cinblock; - psig->coublock += - task_io_get_oublock(p) + - sig->oublock + sig->coublock; - task_io_accounting_add(&psig->ioac, &p->ioac); - task_io_accounting_add(&psig->ioac, &sig->ioac); - spin_unlock_irq(&p->parent->sighand->siglock); - } - - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. - */ - read_unlock(&tasklist_lock); - - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; - status = (p->signal->flags & SIGNAL_GROUP_EXIT) - ? p->signal->group_exit_code : p->exit_code; - if (!retval && stat_addr) - retval = put_user(status, stat_addr); - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) { - int why; - - if ((status & 0x7f) == 0) { - why = CLD_EXITED; - status >>= 8; - } else { - why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - status &= 0x7f; - } - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(status, &infop->si_status); - } - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; - - if (traced) { - write_lock_irq(&tasklist_lock); - /* We dropped tasklist, ptracer could die and untrace */ - ptrace_unlink(p); - /* - * If this is not a detached task, notify the parent. - * If it's still not detached after that, don't release - * it now. - */ - if (!task_detached(p)) { - do_notify_parent(p, p->exit_signal); - if (!task_detached(p)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } - } - write_unlock_irq(&tasklist_lock); - } - if (p != NULL) - release_task(p); - - return retval; -} - -/* - * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. 
- */ -static int wait_task_stopped(int ptrace, struct task_struct *p, - int options, struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - int retval, exit_code, why; - uid_t uid = 0; /* unneeded, required by compiler */ - pid_t pid; - - if (!(options & WUNTRACED)) - return 0; - - exit_code = 0; - spin_lock_irq(&p->sighand->siglock); - - if (unlikely(!task_is_stopped_or_traced(p))) - goto unlock_sig; - - if (!ptrace && p->signal->group_stop_count > 0) - /* - * A group stop is in progress and this is the group leader. - * We won't report until all threads have stopped. - */ - goto unlock_sig; - - exit_code = p->exit_code; - if (!exit_code) - goto unlock_sig; - - if (!unlikely(options & WNOWAIT)) - p->exit_code = 0; - - /* don't need the RCU readlock here as we're holding a spinlock */ - uid = __task_cred(p)->uid; -unlock_sig: - spin_unlock_irq(&p->sighand->siglock); - if (!exit_code) - return 0; - - /* - * Now we are pretty sure this task is interesting. - * Make sure it doesn't get reaped out from under us while we - * give up the lock and then examine it below. We don't want to - * keep holding onto the tasklist_lock while we call getrusage and - * possibly take page faults for user memory. - */ - get_task_struct(p); - pid = task_pid_vnr(p); - why = ptrace ? CLD_TRAPPED : CLD_STOPPED; - read_unlock(&tasklist_lock); - - if (unlikely(options & WNOWAIT)) - return wait_noreap_copyout(p, pid, uid, - why, exit_code, - infop, ru); - - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; - if (!retval && stat_addr) - retval = put_user((exit_code << 8) | 0x7f, stat_addr); - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) - retval = put_user((short)why, &infop->si_code); - if (!retval && infop) - retval = put_user(exit_code, &infop->si_status); - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; - put_task_struct(p); - - BUG_ON(!retval); - return retval; -} - -/* - * Handle do_wait work for one task in a live, non-stopped state. - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. - */ -static int wait_task_continued(struct task_struct *p, int options, - struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - int retval; - pid_t pid; - uid_t uid; - - if (!unlikely(options & WCONTINUED)) - return 0; - - if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) - return 0; - - spin_lock_irq(&p->sighand->siglock); - /* Re-check with the lock held. */ - if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { - spin_unlock_irq(&p->sighand->siglock); - return 0; - } - if (!unlikely(options & WNOWAIT)) - p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = __task_cred(p)->uid; - spin_unlock_irq(&p->sighand->siglock); - - pid = task_pid_vnr(p); - get_task_struct(p); - read_unlock(&tasklist_lock); - - if (!infop) { - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; - put_task_struct(p); - if (!retval && stat_addr) - retval = put_user(0xffff, stat_addr); - if (!retval) - retval = pid; - } else { - retval = wait_noreap_copyout(p, pid, uid, - CLD_CONTINUED, SIGCONT, - infop, ru); - BUG_ON(retval == 0); - } - - return retval; -} - -/* - * Consider @p for a wait by @parent. 
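wait_task_stopped() and wait_task_continued() above are where the (exit_code << 8) | 0x7f and 0xffff status words come from; userspace sees them through WIFSTOPPED()/WSTOPSIG() and WIFCONTINUED(). A minimal sketch of observing both events, assuming only POSIX job-control signals and waitpid():

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		for (;;)
			pause();        /* child just sits there */

	int status;

	kill(pid, SIGSTOP);
	waitpid(pid, &status, WUNTRACED);      /* reported via wait_task_stopped() */
	if (WIFSTOPPED(status))
		printf("stopped by signal %d\n", WSTOPSIG(status));

	kill(pid, SIGCONT);
	waitpid(pid, &status, WCONTINUED);     /* reported via wait_task_continued() */
	if (WIFCONTINUED(status))
		printf("continued\n");

	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);
	return 0;
}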
- * - * -ECHILD should be in *@notask_error before the first call. - * Returns nonzero for a final return, when we have unlocked tasklist_lock. - * Returns zero if the search for a child should continue; - * then *@notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. - */ -static int wait_consider_task(struct task_struct *parent, int ptrace, - struct task_struct *p, int *notask_error, - enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - int ret = eligible_child(type, pid, options, p); - if (!ret) - return ret; - - if (unlikely(ret < 0)) { - /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. - */ - if (*notask_error) - *notask_error = ret; - } - - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* - * This child is hidden by ptrace. - * We aren't allowed to see it now, but eventually we will. - */ - *notask_error = 0; - return 0; - } - - if (p->exit_state == EXIT_DEAD) - return 0; - - /* - * We don't reap group leaders with subthreads. - */ - if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) - return wait_task_zombie(p, options, infop, stat_addr, ru); - - /* - * It's stopped or running now, so it might - * later continue, exit, or stop again. - */ - *notask_error = 0; - - if (task_is_stopped_or_traced(p)) - return wait_task_stopped(ptrace, p, options, - infop, stat_addr, ru); - - return wait_task_continued(p, options, infop, stat_addr, ru); -} - -/* - * Do the work of do_wait() for one thread in the group, @tsk. - * - * -ECHILD should be in *@notask_error before the first call. - * Returns nonzero for a final return, when we have unlocked tasklist_lock. - * Returns zero if the search for a child should continue; then - * *@notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. - */ -static int do_wait_thread(struct task_struct *tsk, int *notask_error, - enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, int __user *stat_addr, - struct rusage __user *ru) -{ - struct task_struct *p; - - list_for_each_entry(p, &tsk->children, sibling) { - /* - * Do not consider detached threads. - */ - if (!task_detached(p)) { - int ret = wait_consider_task(tsk, 0, p, notask_error, - type, pid, options, - infop, stat_addr, ru); - if (ret) - return ret; - } - } - - return 0; -} - -static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, - enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, int __user *stat_addr, - struct rusage __user *ru) -{ - struct task_struct *p; - - /* - * Traditionally we see ptrace'd stopped tasks regardless of options. 
- */ - options |= WUNTRACED; - - list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(tsk, 1, p, notask_error, - type, pid, options, - infop, stat_addr, ru); - if (ret) - return ret; - } - - return 0; -} - -static long do_wait(enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, int __user *stat_addr, - struct rusage __user *ru) -{ - DECLARE_WAITQUEUE(wait, current); - struct task_struct *tsk; - int retval; - - trace_sched_process_wait(pid); - - add_wait_queue(¤t->signal->wait_chldexit,&wait); -repeat: - /* - * If there is nothing that can match our critiera just get out. - * We will clear @retval to zero if we see any child that might later - * match our criteria, even if we are not able to reap it yet. - */ - retval = -ECHILD; - if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) - goto end; - - current->state = TASK_INTERRUPTIBLE; - read_lock(&tasklist_lock); - tsk = current; - do { - int tsk_result = do_wait_thread(tsk, &retval, - type, pid, options, - infop, stat_addr, ru); - if (!tsk_result) - tsk_result = ptrace_do_wait(tsk, &retval, - type, pid, options, - infop, stat_addr, ru); - if (tsk_result) { - /* - * tasklist_lock is unlocked and we have a final result. - */ - retval = tsk_result; - goto end; - } - - if (options & __WNOTHREAD) - break; - tsk = next_thread(tsk); - BUG_ON(tsk->signal != current->signal); - } while (tsk != current); - read_unlock(&tasklist_lock); - - if (!retval && !(options & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } - -end: - current->state = TASK_RUNNING; - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (infop) { - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. 
- */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } - return retval; -} - -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) -{ - struct pid *pid = NULL; - enum pid_type type; - long ret; - - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) - return -EINVAL; - if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) - return -EINVAL; - - switch (which) { - case P_ALL: - type = PIDTYPE_MAX; - break; - case P_PID: - type = PIDTYPE_PID; - if (upid <= 0) - return -EINVAL; - break; - case P_PGID: - type = PIDTYPE_PGID; - if (upid <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - - if (type < PIDTYPE_MAX) - pid = find_get_pid(upid); - ret = do_wait(type, pid, options, infop, NULL, ru); - put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(5, ret, which, upid, infop, options, ru); - return ret; -} - -SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, - int, options, struct rusage __user *, ru) -{ - struct pid *pid = NULL; - enum pid_type type; - long ret; - - if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| - __WNOTHREAD|__WCLONE|__WALL)) - return -EINVAL; - - if (upid == -1) - type = PIDTYPE_MAX; - else if (upid < 0) { - type = PIDTYPE_PGID; - pid = find_get_pid(-upid); - } else if (upid == 0) { - type = PIDTYPE_PGID; - pid = get_pid(task_pgrp(current)); - } else /* upid > 0 */ { - type = PIDTYPE_PID; - pid = find_get_pid(upid); - } - - ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); - put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, upid, stat_addr, options, ru); - return ret; -} - -#ifdef __ARCH_WANT_SYS_WAITPID - -/* - * sys_waitpid() remains for compatibility. waitpid() should be - * implemented by calling sys_wait4() from libc.a. - */ -SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) -{ - return sys_wait4(pid, stat_addr, options, NULL); -} - -#endif -#endif diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/resource.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/resource.c.svn-base deleted file mode 100644 index 3dd07a35..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/resource.c.svn-base +++ /dev/null @@ -1,936 +0,0 @@ -/* - * linux/kernel/resource.c - * - * Copyright (C) 1999 Linus Torvalds - * Copyright (C) 1999 Martin Mares <mj@ucw.cz> - * - * Arbitrary resource management. 
- */ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/ioport.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/fs.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/device.h> -#include <linux/pfn.h> -#include <asm/io.h> - - -struct resource ioport_resource = { - .name = "PCI IO", - .start = 0, - .end = IO_SPACE_LIMIT, - .flags = IORESOURCE_IO, -}; -EXPORT_SYMBOL(ioport_resource); - -struct resource iomem_resource = { - .name = "PCI mem", - .start = 0, - .end = -1, - .flags = IORESOURCE_MEM, -}; -EXPORT_SYMBOL(iomem_resource); - -static DEFINE_RWLOCK(resource_lock); - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct resource *p = v; - (*pos)++; - if (p->child) - return p->child; - while (!p->sibling && p->parent) - p = p->parent; - return p->sibling; -} - -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - -static void *r_start(struct seq_file *m, loff_t *pos) - __acquires(resource_lock) -{ - struct resource *p = m->private; - loff_t l = 0; - read_lock(&resource_lock); - for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) - ; - return p; -} - -static void r_stop(struct seq_file *m, void *v) - __releases(resource_lock) -{ - read_unlock(&resource_lock); -} - -static int r_show(struct seq_file *m, void *v) -{ - struct resource *root = m->private; - struct resource *r = v, *p; - int width = root->end < 0x10000 ? 4 : 8; - int depth; - - for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) - if (p->parent == root) - break; - seq_printf(m, "%*s%0*llx-%0*llx : %s\n", - depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, - r->name ? r->name : "<BAD>"); - return 0; -} - -static const struct seq_operations resource_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = r_show, -}; - -static int ioports_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &ioport_resource; - } - return res; -} - -static int iomem_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &iomem_resource; - } - return res; -} - -static const struct file_operations proc_ioports_operations = { - .open = ioports_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations proc_iomem_operations = { - .open = iomem_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init ioresources_init(void) -{ - proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); - return 0; -} -__initcall(ioresources_init); - -#endif /* CONFIG_PROC_FS */ - -/* Return the conflict entry if you can't request it */ -static struct resource * __request_resource(struct resource *root, struct resource *new) -{ - resource_size_t start = new->start; - resource_size_t end = new->end; - struct resource *tmp, **p; - - if (end < start) - return root; - if (start < root->start) - return root; - if (end > root->end) - return root; - p = &root->child; - for (;;) { - tmp = *p; - if (!tmp || tmp->start > end) { - new->sibling = tmp; - *p = new; - new->parent = root; - return NULL; - } - p = &tmp->sibling; - if (tmp->end < start) - continue; - return tmp; - } -} - -static int 
__release_resource(struct resource *old) -{ - struct resource *tmp, **p; - - p = &old->parent->child; - for (;;) { - tmp = *p; - if (!tmp) - break; - if (tmp == old) { - *p = tmp->sibling; - old->parent = NULL; - return 0; - } - p = &tmp->sibling; - } - return -EINVAL; -} - -/** - * request_resource - request and reserve an I/O or memory resource - * @root: root resource descriptor - * @new: resource descriptor desired by caller - * - * Returns 0 for success, negative error code on error. - */ -int request_resource(struct resource *root, struct resource *new) -{ - struct resource *conflict; - - write_lock(&resource_lock); - conflict = __request_resource(root, new); - write_unlock(&resource_lock); - return conflict ? -EBUSY : 0; -} - -EXPORT_SYMBOL(request_resource); - -/** - * release_resource - release a previously reserved resource - * @old: resource pointer - */ -int release_resource(struct resource *old) -{ - int retval; - - write_lock(&resource_lock); - retval = __release_resource(old); - write_unlock(&resource_lock); - return retval; -} - -EXPORT_SYMBOL(release_resource); - -#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) -/* - * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags. - * If found, returns 0, res is overwritten, if not found, returns -1. - */ -static int find_next_system_ram(struct resource *res) -{ - resource_size_t start, end; - struct resource *p; - - BUG_ON(!res); - - start = res->start; - end = res->end; - BUG_ON(start >= end); - - read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ - if (p->flags != res->flags) - continue; - if (p->start > end) { - p = NULL; - break; - } - if ((p->end >= start) && (p->start < end)) - break; - } - read_unlock(&resource_lock); - if (!p) - return -1; - /* copy data */ - if (res->start < p->start) - res->start = p->start; - if (res->end > p->end) - res->end = p->end; - return 0; -} -int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) -{ - struct resource res; - unsigned long pfn, len; - u64 orig_end; - int ret = -1; - res.start = (u64) start_pfn << PAGE_SHIFT; - res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { - pfn = (unsigned long)(res.start >> PAGE_SHIFT); - len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); - ret = (*func)(pfn, len, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; -} - -#endif - -/* - * Find empty slot in the resource tree given range and alignment. - */ -static int find_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), - void *alignf_data) -{ - struct resource *this = root->child; - - new->start = root->start; - /* - * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to new->end below would cause an underflow. 
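request_resource() and release_resource() above are the usual way a driver claims a range under one of the two root resources. A minimal kernel-style sketch, where the device name and the 0x300-0x31f port range are purely illustrative:

#include <linux/ioport.h>

static struct resource example_region = {
	.name  = "example-device",     /* illustrative name */
	.start = 0x300,
	.end   = 0x31f,
	.flags = IORESOURCE_IO,
};

static int example_claim_ports(void)
{
	int err = request_resource(&ioport_resource, &example_region);

	if (err)                /* -EBUSY: something in the tree already overlaps */
		return err;

	/* ... program the device ... */

	release_resource(&example_region);
	return 0;
}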
- */ - if (this && this->start == 0) { - new->start = this->end + 1; - this = this->sibling; - } - for(;;) { - if (this) - new->end = this->start - 1; - else - new->end = root->end; - if (new->start < min) - new->start = min; - if (new->end > max) - new->end = max; - new->start = ALIGN(new->start, align); - if (alignf) - alignf(alignf_data, new, size, align); - if (new->start < new->end && new->end - new->start >= size - 1) { - new->end = new->start + size - 1; - return 0; - } - if (!this) - break; - new->start = this->end + 1; - this = this->sibling; - } - return -EBUSY; -} - -/** - * allocate_resource - allocate empty slot in the resource tree given range & alignment - * @root: root resource descriptor - * @new: resource descriptor desired by caller - * @size: requested resource region size - * @min: minimum size to allocate - * @max: maximum size to allocate - * @align: alignment requested, in bytes - * @alignf: alignment function, optional, called if not NULL - * @alignf_data: arbitrary data to pass to the @alignf function - */ -int allocate_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), - void *alignf_data) -{ - int err; - - write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); - if (err >= 0 && __request_resource(root, new)) - err = -EBUSY; - write_unlock(&resource_lock); - return err; -} - -EXPORT_SYMBOL(allocate_resource); - -/* - * Insert a resource into the resource tree. If successful, return NULL, - * otherwise return the conflicting resource (compare to __request_resource()) - */ -static struct resource * __insert_resource(struct resource *parent, struct resource *new) -{ - struct resource *first, *next; - - for (;; parent = first) { - first = __request_resource(parent, new); - if (!first) - return first; - - if (first == parent) - return first; - - if ((first->start > new->start) || (first->end < new->end)) - break; - if ((first->start == new->start) && (first->end == new->end)) - break; - } - - for (next = first; ; next = next->sibling) { - /* Partial overlap? Bad, and unfixable */ - if (next->start < new->start || next->end > new->end) - return next; - if (!next->sibling) - break; - if (next->sibling->start > new->end) - break; - } - - new->parent = parent; - new->sibling = next->sibling; - new->child = first; - - next->sibling = NULL; - for (next = first; next; next = next->sibling) - next->parent = new; - - if (parent->child == first) { - parent->child = new; - } else { - next = parent->child; - while (next->sibling != first) - next = next->sibling; - next->sibling = new; - } - return NULL; -} - -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - * - * This function is equivalent to request_resource when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. - */ -int insert_resource(struct resource *parent, struct resource *new) -{ - struct resource *conflict; - - write_lock(&resource_lock); - conflict = __insert_resource(parent, new); - write_unlock(&resource_lock); - return conflict ? 
-EBUSY : 0; -} - -/** - * insert_resource_expand_to_fit - Insert a resource into the resource tree - * @root: root resource descriptor - * @new: new resource to insert - * - * Insert a resource into the resource tree, possibly expanding it in order - * to make it encompass any conflicting resources. - */ -void insert_resource_expand_to_fit(struct resource *root, struct resource *new) -{ - if (new->parent) - return; - - write_lock(&resource_lock); - for (;;) { - struct resource *conflict; - - conflict = __insert_resource(root, new); - if (!conflict) - break; - if (conflict == root) - break; - - /* Ok, expand resource to cover the conflict, then try again .. */ - if (conflict->start < new->start) - new->start = conflict->start; - if (conflict->end > new->end) - new->end = conflict->end; - - printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); - } - write_unlock(&resource_lock); -} - -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) -{ - struct resource *tmp, *parent = res->parent; - resource_size_t end = start + size - 1; - int result = -EBUSY; - - write_lock(&resource_lock); - - if ((start < parent->start) || (end > parent->end)) - goto out; - - for (tmp = res->child; tmp; tmp = tmp->sibling) { - if ((tmp->start < start) || (tmp->end > end)) - goto out; - } - - if (res->sibling && (res->sibling->start <= end)) - goto out; - - tmp = parent->child; - if (tmp != res) { - while (tmp->sibling != res) - tmp = tmp->sibling; - if (start <= tmp->end) - goto out; - } - - res->start = start; - res->end = end; - result = 0; - - out: - write_unlock(&resource_lock); - return result; -} - -static void __init __reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) -{ - struct resource *parent = root; - struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); - - if (!res) - return; - - res->name = name; - res->start = start; - res->end = end; - res->flags = IORESOURCE_BUSY; - - for (;;) { - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } - - /* Uhhuh, that didn't work out.. 
*/ - kfree(res); - res = NULL; - break; - } - - if (!res) { - /* failed, split and try again */ - - /* conflict covered whole area */ - if (conflict->start <= start && conflict->end >= end) - return; - - if (conflict->start > start) - __reserve_region_with_split(root, start, conflict->start-1, name); - if (!(conflict->flags & IORESOURCE_BUSY)) { - resource_size_t common_start, common_end; - - common_start = max(conflict->start, start); - common_end = min(conflict->end, end); - if (common_start < common_end) - __reserve_region_with_split(root, common_start, common_end, name); - } - if (conflict->end < end) - __reserve_region_with_split(root, conflict->end+1, end, name); - } - -} - -void __init reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) -{ - write_lock(&resource_lock); - __reserve_region_with_split(root, start, end, name); - write_unlock(&resource_lock); -} - -EXPORT_SYMBOL(adjust_resource); - -/** - * resource_alignment - calculate resource's alignment - * @res: resource pointer - * - * Returns alignment on success, 0 (invalid alignment) on failure. - */ -resource_size_t resource_alignment(struct resource *res) -{ - switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { - case IORESOURCE_SIZEALIGN: - return resource_size(res); - case IORESOURCE_STARTALIGN: - return res->start; - default: - return 0; - } -} - -/* - * This is compatibility stuff for IO resources. - * - * Note how this, unlike the above, knows about - * the IO flag meanings (busy etc). - * - * request_region creates a new busy region. - * - * check_region returns non-zero if the area is already busy. - * - * release_region releases a matching busy region. - */ - -#ifndef DDE_LINUX -/** - * __request_region - create a new busy resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * @name: reserving caller's ID string - * @flags: IO resource flags - */ -struct resource * __request_region(struct resource *parent, - resource_size_t start, resource_size_t n, - const char *name, int flags) -{ - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - - if (!res) - return NULL; - - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; - res->flags |= flags; - - write_lock(&resource_lock); - - for (;;) { - struct resource *conflict; - - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } - - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; - break; - } - write_unlock(&resource_lock); - return res; -} -EXPORT_SYMBOL(__request_region); - -/** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. 
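The deprecation note above is about the check-then-request race; the replacement pattern is to call request_region(), which reserves the range or fails atomically. A short sketch assuming the usual request_region()/release_region() wrappers from <linux/ioport.h>, with a made-up port range:

#include <linux/errno.h>
#include <linux/ioport.h>

#define EXAMPLE_BASE 0x2f8     /* illustrative I/O port base */
#define EXAMPLE_LEN  8

static int example_probe_ports(void)
{
	/* Reserve-or-fail in one step; no check_region()/request_region() window. */
	if (!request_region(EXAMPLE_BASE, EXAMPLE_LEN, "example"))
		return -EBUSY;

	/* ... talk to the hardware ... */

	release_region(EXAMPLE_BASE, EXAMPLE_LEN);
	return 0;
}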
- */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region", 0); - if (!res) - return -EBUSY; - - release_resource(res); - kfree(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - -/** - * __release_region - release a previously reserved resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * The described resource region must match a currently busy region. - */ -void __release_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource **p; - resource_size_t end; - - p = &parent->child; - end = start + n - 1; - - write_lock(&resource_lock); - - for (;;) { - struct resource *res = *p; - - if (!res) - break; - if (res->start <= start && res->end >= end) { - if (!(res->flags & IORESOURCE_BUSY)) { - p = &res->child; - continue; - } - if (res->start != start || res->end != end) - break; - *p = res->sibling; - write_unlock(&resource_lock); - kfree(res); - return; - } - p = &res->sibling; - } - - write_unlock(&resource_lock); - - printk(KERN_WARNING "Trying to free nonexistent resource " - "<%016llx-%016llx>\n", (unsigned long long)start, - (unsigned long long)end); -} -EXPORT_SYMBOL(__release_region); -#endif /* DDE_LINUX */ - -/* - * Managed region resource - */ -struct region_devres { - struct resource *parent; - resource_size_t start; - resource_size_t n; -}; - -static void devm_region_release(struct device *dev, void *res) -{ - struct region_devres *this = res; - - __release_region(this->parent, this->start, this->n); -} - -static int devm_region_match(struct device *dev, void *res, void *match_data) -{ - struct region_devres *this = res, *match = match_data; - - return this->parent == match->parent && - this->start == match->start && this->n == match->n; -} - -struct resource * __devm_request_region(struct device *dev, - struct resource *parent, resource_size_t start, - resource_size_t n, const char *name) -{ - struct region_devres *dr = NULL; - struct resource *res; - - dr = devres_alloc(devm_region_release, sizeof(struct region_devres), - GFP_KERNEL); - if (!dr) - return NULL; - - dr->parent = parent; - dr->start = start; - dr->n = n; - - res = __request_region(parent, start, n, name, 0); - if (res) - devres_add(dev, dr); - else - devres_free(dr); - - return res; -} -EXPORT_SYMBOL(__devm_request_region); - -void __devm_release_region(struct device *dev, struct resource *parent, - resource_size_t start, resource_size_t n) -{ - struct region_devres match_data = { parent, start, n }; - - __release_region(parent, start, n); - WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, - &match_data)); -} -EXPORT_SYMBOL(__devm_release_region); - -/* - * Called from init/main.c to reserve IO ports. - */ -#define MAXRESERVE 4 -static int __init reserve_setup(char *str) -{ - static int reserved; - static struct resource reserve[MAXRESERVE]; - - for (;;) { - int io_start, io_num; - int x = reserved; - - if (get_option (&str, &io_start) != 2) - break; - if (get_option (&str, &io_num) == 0) - break; - if (x < MAXRESERVE) { - struct resource *res = reserve + x; - res->name = "reserved"; - res->start = io_start; - res->end = io_start + io_num - 1; - res->flags = IORESOURCE_BUSY; - res->child = NULL; - if (request_resource(res->start >= 0x10000 ? 
&iomem_resource : &ioport_resource, res) == 0) - reserved = x+1; - } - } - return 1; -} - -__setup("reserve=", reserve_setup); - -/* - * Check if the requested addr and size spans more than any slot in the - * iomem resource tree. - */ -int iomem_map_sanity_check(resource_size_t addr, unsigned long size) -{ - struct resource *p = &iomem_resource; - int err = 0; - loff_t l; - - read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { - /* - * We can probably skip the resources without - * IORESOURCE_IO attribute? - */ - if (p->start >= addr + size) - continue; - if (p->end < addr) - continue; - if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && - PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) - continue; - /* - * if a resource is "BUSY", it's not a hardware resource - * but a driver mapping of such a resource; we don't want - * to warn for those; some drivers legitimately map only - * partial hardware resources. (example: vesafb) - */ - if (p->flags & IORESOURCE_BUSY) - continue; - - printk(KERN_WARNING "resource map sanity check conflict: " - "0x%llx 0x%llx 0x%llx 0x%llx %s\n", - (unsigned long long)addr, - (unsigned long long)(addr + size - 1), - (unsigned long long)p->start, - (unsigned long long)p->end, - p->name); - err = -1; - break; - } - read_unlock(&resource_lock); - - return err; -} - -#ifdef CONFIG_STRICT_DEVMEM -static int strict_iomem_checks = 1; -#else -static int strict_iomem_checks; -#endif - -/* - * check if an address is reserved in the iomem resource tree - * returns 1 if reserved, 0 if not reserved. - */ -int iomem_is_exclusive(u64 addr) -{ - struct resource *p = &iomem_resource; - int err = 0; - loff_t l; - int size = PAGE_SIZE; - - if (!strict_iomem_checks) - return 0; - - addr = addr & PAGE_MASK; - - read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { - /* - * We can probably skip the resources without - * IORESOURCE_IO attribute? - */ - if (p->start >= addr + size) - break; - if (p->end < addr) - continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { - err = 1; - break; - } - } - read_unlock(&resource_lock); - - return err; -} - -static int __init strict_iomem(char *str) -{ - if (strstr(str, "relaxed")) - strict_iomem_checks = 0; - if (strstr(str, "strict")) - strict_iomem_checks = 1; - return 1; -} - -__setup("iomem=", strict_iomem); diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/sched.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/sched.c.svn-base deleted file mode 100644 index 5c51695e..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/sched.c.svn-base +++ /dev/null @@ -1,9654 +0,0 @@ -/* - * kernel/sched.c - * - * Kernel scheduler and related syscalls - * - * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. 
- * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz - */ - -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/uaccess.h> -#include <linux/highmem.h> -#include <linux/smp_lock.h> -#include <asm/mmu_context.h> -#include <linux/interrupt.h> -#include <linux/capability.h> -#include <linux/completion.h> -#include <linux/kernel_stat.h> -#include <linux/debug_locks.h> -#include <linux/security.h> -#include <linux/notifier.h> -#include <linux/profile.h> -#include <linux/freezer.h> -#include <linux/vmalloc.h> -#include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/pid_namespace.h> -#include <linux/smp.h> -#include <linux/threads.h> -#include <linux/timer.h> -#include <linux/rcupdate.h> -#include <linux/cpu.h> -#include <linux/cpuset.h> -#include <linux/percpu.h> -#include <linux/kthread.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/sysctl.h> -#include <linux/syscalls.h> -#include <linux/times.h> -#include <linux/tsacct_kern.h> -#include <linux/kprobes.h> -#include <linux/delayacct.h> -#include <linux/reciprocal_div.h> -#include <linux/unistd.h> -#include <linux/pagemap.h> -#include <linux/hrtimer.h> -#include <linux/tick.h> -#include <linux/bootmem.h> -#include <linux/debugfs.h> -#include <linux/ctype.h> -#include <linux/ftrace.h> -#include <trace/sched.h> - -#include <asm/tlb.h> -#include <asm/irq_regs.h> - -#include "sched_cpupri.h" - -#ifdef DDE_LINUX -/* DDE_LINUX implements this function externally */ -extern int try_to_wake_up(struct task_struct *p, unsigned int state, int sync); -#endif - -/** DDE only uses small parts of this. */ -#ifndef DDE_LINUX -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - -/* - * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define DEF_TIMESLICE (100 * HZ / 1000) - -/* - * single value that denotes runtime == period, ie unlimited time. 
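The NICE_TO_PRIO()/PRIO_TO_NICE() macros above map the user-visible nice range [-20, 19] onto the kernel's priority scale. A standalone worked example, assuming the conventional MAX_RT_PRIO value of 100 (defined in the scheduler headers, not in this file):

#include <stdio.h>

#define MAX_RT_PRIO        100                      /* assumed conventional value */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)       ((p) - MAX_RT_PRIO)

int main(void)
{
	printf("nice   0 -> prio %d, user prio %d\n",
	       NICE_TO_PRIO(0), USER_PRIO(NICE_TO_PRIO(0)));    /* 120, 20 */
	printf("nice -20 -> prio %d, user prio %d\n",
	       NICE_TO_PRIO(-20), USER_PRIO(NICE_TO_PRIO(-20))); /* 100, 0  */
	printf("nice  19 -> prio %d, user prio %d\n",
	       NICE_TO_PRIO(19), USER_PRIO(NICE_TO_PRIO(19)));   /* 139, 39 */
	printf("prio 139 -> nice %d\n", PRIO_TO_NICE(139));      /* 19 */
	return 0;
}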
- */ -#define RUNTIME_INF ((u64)~0ULL) - -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - -#ifdef CONFIG_SMP - -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif - -static inline int rt_policy(int policy) -{ - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) - return 1; - return 0; -} - -static inline int task_has_rt_policy(struct task_struct *p) -{ - return rt_policy(p->policy); -} - -/* - * This is the priority-queue data structure of the RT scheduling class: - */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct rt_bandwidth { - /* nests inside the rq lock: */ - spinlock_t rt_runtime_lock; - ktime_t rt_period; - u64 rt_runtime; - struct hrtimer rt_period_timer; -}; - -static struct rt_bandwidth def_rt_bandwidth; - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_bandwidth *rt_b = - container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - - if (!overrun) - break; - - idle = do_sched_rt_period_timer(rt_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -static -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) -{ - rt_b->rt_period = ns_to_ktime(period); - rt_b->rt_runtime = runtime; - - spin_lock_init(&rt_b->rt_runtime_lock); - - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; -} - -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - ktime_t now; - - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - - spin_lock(&rt_b->rt_runtime_lock); - for (;;) { - if (hrtimer_active(&rt_b->rt_period_timer)) - break; - - now = hrtimer_cb_get_time(&rt_b->rt_period_timer); - hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start_expires(&rt_b->rt_period_timer, - HRTIMER_MODE_ABS); - } - spin_unlock(&rt_b->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - hrtimer_cancel(&rt_b->rt_period_timer); -} -#endif - -/* - * sched_domains_mutex serializes calls to arch_init_sched_domains, - * detach_destroy_domains and partition_sched_domains. 
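sg_div_cpu_power() above replaces a division by the near-constant cpu_power with a reciprocal multiply. A standalone sketch of the idea, with the precompute/divide formulas written out as an illustration of the technique rather than a verbatim copy of lib/reciprocal_div.c:

#include <stdint.h>
#include <stdio.h>

/* Precompute ceil(2^32 / divisor); done once whenever the "constant" changes. */
static uint32_t reciprocal_value_demo(uint32_t divisor)
{
	return (uint32_t)(((1ULL << 32) + divisor - 1) / divisor);
}

/* a / divisor becomes a multiply and a shift on the hot path. */
static uint32_t reciprocal_divide_demo(uint32_t a, uint32_t recip)
{
	return (uint32_t)(((uint64_t)a * recip) >> 32);
}

int main(void)
{
	uint32_t cpu_power = 1178;                    /* arbitrary example value */
	uint32_t recip = reciprocal_value_demo(cpu_power);
	uint32_t load = 40960;

	printf("%u / %u = %u (plain) vs %u (reciprocal)\n",
	       load, cpu_power, load / cpu_power,
	       reciprocal_divide_demo(load, recip));
	return 0;
}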
- */ -static DEFINE_MUTEX(sched_domains_mutex); - -#ifdef CONFIG_GROUP_SCHED - -#include <linux/cgroup.h> - -struct cfs_rq; - -static LIST_HEAD(task_groups); - -/* task group related information */ -struct task_group { -#ifdef CONFIG_CGROUP_SCHED - struct cgroup_subsys_state css; -#endif - -#ifdef CONFIG_USER_SCHED - uid_t uid; -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - struct rt_bandwidth rt_bandwidth; -#endif - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; -}; - -#ifdef CONFIG_USER_SCHED - -/* Helper function to pass uid information to create_sched_user() */ -void set_tg_uid(struct user_struct *user) -{ - user->tg->uid = user->uid; -} - -/* - * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. - */ -struct task_group root_task_group; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Default task group's sched entity on each cpu */ -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); -/* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; -#endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_USER_SCHED */ -#define root_task_group init_task_group -#endif /* CONFIG_USER_SCHED */ - -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. - */ -static DEFINE_SPINLOCK(task_group_lock); - -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else /* !CONFIG_USER_SCHED */ -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif /* CONFIG_USER_SCHED */ - -/* - * A weight of 0 or 1 can cause arithmetics problems. - * A weight of a cfs_rq is the sum of weights of which entities - * are queued on this cfs_rq, so a weight of a entity should not be - * too large, so as the shares value of a task group. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ -#define MIN_SHARES 2 -#define MAX_SHARES (1UL << 18) - -static int init_task_group_load = INIT_TASK_GROUP_LOAD; -#endif - -/* Default task group. - * Every task in system belong to this group at bootup. 
- */ -struct task_group init_task_group; - -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_USER_SCHED - rcu_read_lock(); - tg = __task_cred(p)->user->tg; - rcu_read_unlock(); -#elif defined(CONFIG_CGROUP_SCHED) - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - -#endif /* CONFIG_GROUP_SCHED */ - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running; - - u64 exec_clock; - u64 min_vruntime; - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - - struct list_head tasks; - struct list_head *balance_iterator; - - /* - * 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr, *next, *last; - - unsigned int nr_spread_over; - -#ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long h_load; - - /* - * this cpu's part of tg->shares - */ - unsigned long shares; - - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; -#endif -#endif -}; - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - int highest_prio; /* highest queued rt task prio */ -#endif -#ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - int overloaded; -#endif - int rt_throttled; - u64 rt_time; - u64 rt_runtime; - /* Nests inside the rq lock: */ - spinlock_t rt_runtime_lock; - -#ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; - - struct rq *rq; - struct list_head leaf_rt_rq_list; - struct task_group *tg; - struct sched_rt_entity *rt_se; -#endif -}; - -#ifdef CONFIG_SMP - -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. 
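The rt_rq/rt_prio_array pair above keeps one task list per RT priority plus a bitmap of non-empty lists, so picking the next RT task reduces to a find-first-bit over the bitmap. A simplified standalone sketch (64 priority levels instead of MAX_RT_PRIO, an integer count standing in for each list):

#include <stdint.h>
#include <stdio.h>

#define DEMO_NR_PRIOS 64   /* the real array covers MAX_RT_PRIO slots */

struct demo_prio_array {
	uint64_t bitmap;                    /* bit p set => queue p is non-empty */
	int      nr_queued[DEMO_NR_PRIOS];  /* stand-in for the per-priority lists */
};

static void demo_enqueue(struct demo_prio_array *a, int prio)
{
	a->nr_queued[prio]++;
	a->bitmap |= 1ULL << prio;
}

static int demo_pick_highest(const struct demo_prio_array *a)
{
	/* Lower index = higher RT priority; scan for the first set bit
	 * (the kernel uses a find-first-bit helper for this). */
	for (int p = 0; p < DEMO_NR_PRIOS; p++)
		if (a->bitmap & (1ULL << p))
			return p;
	return -1;
}

int main(void)
{
	struct demo_prio_array a = { 0 };

	demo_enqueue(&a, 10);
	demo_enqueue(&a, 3);
	demo_enqueue(&a, 40);
	printf("next RT priority to run: %d\n", demo_pick_highest(&a)); /* 3 */
	return 0;
}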
- * - */ -struct root_domain { - atomic_t refcount; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - atomic_t rto_count; -#ifdef CONFIG_SMP - struct cpupri cpupri; -#endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct rq { - /* runqueue lock: */ - spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned char idle_at_tick; -#ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; - unsigned char in_nohz_recently; -#endif - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; - struct rt_rq rt; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - struct task_struct *curr, *idle; - unsigned long next_balance; - struct mm_struct *prev_mm; - - u64 clock; - - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; - - /* For active balancing */ - int active_balance; - int push_cpu; - /* cpu of this runqueue: */ - int cpu; - int online; - - unsigned long avg_load_per_task; - - struct task_struct *migration_thread; - struct list_head migration_queue; -#endif - -#ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP - int hrtick_csd_pending; - struct call_single_data hrtick_csd; -#endif - struct hrtimer hrtick_timer; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ - - /* sys_sched_yield() stats */ - unsigned int yld_exp_empty; - unsigned int yld_act_empty; - unsigned int yld_both_empty; - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; -#endif -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) -{ - rq->curr->sched_class->check_preempt_curr(rq, p, sync); -} - -static inline int cpu_of(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq->cpu; -#else - return 0; -#endif -} - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. - */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) - -static inline void update_rq_clock(struct rq *rq) -{ - rq->clock = sched_clock_cpu(cpu_of(rq)); -} - -/* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug static const -#endif - -/** - * runqueue_is_locked - * - * Returns true if the current cpu runqueue is locked. - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. 
- */ -int runqueue_is_locked(void) -{ - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - int ret; - - ret = spin_is_locked(&rq->lock); - put_cpu(); - return ret; -} - -/* - * Debugging: various feature bits - */ - -#define SCHED_FEAT(name, enabled) \ - __SCHED_FEAT_##name , - -enum { -#include "sched_features.h" -}; - -#undef SCHED_FEAT - -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | - -const_debug unsigned int sysctl_sched_features = -#include "sched_features.h" - 0; - -#undef SCHED_FEAT - -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static __read_mostly char *sched_feat_names[] = { -#include "sched_features.h" - NULL -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; sched_feat_names[i]; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp = buf; - int neg = 0; - int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - if (strncmp(buf, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { - if (neg) - sysctl_sched_features &= ~(1UL << i); - else - sysctl_sched_features |= (1UL << i); - break; - } - } - - if (!sched_feat_names[i]) - return -EINVAL; - - filp->f_pos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); - -#endif - -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -/* - * Number of tasks to iterate in a single balance run. - * Limited because this is done with IRQs disabled. - */ -const_debug unsigned int sysctl_sched_nr_migrate = 32; - -/* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - -/* - * period over which we measure -rt task cpu usage in us. - * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - -static __read_mostly int scheduler_running; - -/* - * part of the period that we allow rt tasks to run in us. 
- * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - -static inline u64 global_rt_period(void) -{ - return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; -} - -static inline u64 global_rt_runtime(void) -{ - if (sysctl_sched_rt_runtime < 0) - return RUNTIME_INF; - - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; -} - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif - -static inline int task_current(struct rq *rq, struct task_struct *p) -{ - return rq->curr == p; -} - -#ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline int task_running(struct rq *rq, struct task_struct *p) -{ - return task_current(rq, p); -} - -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - spin_unlock_irq(&rq->lock); -} - -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->oncpu = 1; -#endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - spin_unlock_irq(&rq->lock); -#else - spin_unlock(&rq->lock); -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->oncpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->oncpu = 0; -#endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - -/* - * __task_rq_lock - lock the runqueue a given task resides on. - * Must be called interrupts disabled. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - for (;;) { - struct rq *rq = task_rq(p); - spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - spin_unlock(&rq->lock); - } -} - -/* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. 
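/*
 * Illustrative, user-space sketch (not part of sched.c): mirrors how
 * global_rt_period()/global_rt_runtime() above turn the microsecond
 * sysctls into nanoseconds, and shows that the defaults cap -rt tasks
 * at 95% of each period. Names below are local to this sketch only.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	int64_t rt_period_us  = 1000000;  /* default: 1s */
	int64_t rt_runtime_us = 950000;   /* default: 0.95s; a negative value means "no limit" */

	uint64_t period_ns  = (uint64_t)rt_period_us  * NSEC_PER_USEC;
	uint64_t runtime_ns = (uint64_t)rt_runtime_us * NSEC_PER_USEC;

	printf("rt period : %llu ns\n", (unsigned long long)period_ns);
	printf("rt runtime: %llu ns (%.0f%% of the period)\n",
	       (unsigned long long)runtime_ns,
	       100.0 * (double)runtime_ns / (double)period_ns);
	return 0;
}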
- */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - spin_unlock_irqrestore(&rq->lock, *flags); - } -} - -void task_rq_unlock_wait(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - spin_unlock_wait(&rq->lock); -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - spin_unlock(&rq->lock); -} - -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) - __releases(rq->lock) -{ - spin_unlock_irqrestore(&rq->lock, *flags); -} - -/* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - -#ifdef CONFIG_SCHED_HRTICK -/* - * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. - */ - -/* - * Use hrtick when: - * - enabled by features - * - hrtimer is actually high res - */ -static inline int hrtick_enabled(struct rq *rq) -{ - if (!sched_feat(HRTICK)) - return 0; - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); -} - -static void hrtick_clear(struct rq *rq) -{ - if (hrtimer_active(&rq->hrtick_timer)) - hrtimer_cancel(&rq->hrtick_timer); -} - -/* - * High-resolution timer tick. - * Runs from hardirq context with interrupts disabled. - */ -static enum hrtimer_restart hrtick(struct hrtimer *timer) -{ - struct rq *rq = container_of(timer, struct rq, hrtick_timer); - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - spin_lock(&rq->lock); - update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); - spin_unlock(&rq->lock); - - return HRTIMER_NORESTART; -} - -#ifdef CONFIG_SMP -/* - * called from hardirq (IPI) context - */ -static void __hrtick_start(void *arg) -{ - struct rq *rq = arg; - - spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); - rq->hrtick_csd_pending = 0; - spin_unlock(&rq->lock); -} - -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay) -{ - struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - - hrtimer_set_expires(timer, time); - - if (rq == this_rq()) { - hrtimer_restart(timer); - } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); - rq->hrtick_csd_pending = 1; - } -} - -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} -#else -/* - * Called to set the hrtick timer state. 
- * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay) -{ - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); -} - -static inline void init_hrtick(void) -{ -} -#endif /* CONFIG_SMP */ - -static void init_rq_hrtick(struct rq *rq) -{ -#ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; -#endif - - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; -} -#else /* CONFIG_SCHED_HRTICK */ -static inline void hrtick_clear(struct rq *rq) -{ -} - -static inline void init_rq_hrtick(struct rq *rq) -{ -} - -static inline void init_hrtick(void) -{ -} -#endif /* CONFIG_SCHED_HRTICK */ - -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -static void resched_task(struct task_struct *p) -{ - int cpu; - - assert_spin_locked(&task_rq(p)->lock); - - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) - return; - - set_tsk_thread_flag(p, TIF_NEED_RESCHED); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); -} - -static void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - if (!spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); -} - -#ifdef CONFIG_NO_HZ -/* - * When add_timer_on() enqueues a timer into the timer wheel of an - * idle CPU then this timer might expire before the next timer event - * which is scheduled to wake up that CPU. In case of a completely - * idle system the next event might even be infinite time into the - * future. wake_up_idle_cpu() ensures that the CPU is woken up and - * leaves the inner idle loop so the newly added timer is taken into - * account when the CPU goes back to idle and evaluates the timer - * wheel for the next timer event. - */ -void wake_up_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (cpu == smp_processor_id()) - return; - - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. 
The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) - smp_send_reschedule(cpu); -} -#endif /* CONFIG_NO_HZ */ - -#else /* !CONFIG_SMP */ -static void resched_task(struct task_struct *p) -{ - assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} -#endif /* CONFIG_SMP */ - -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - if (!lw->inv_weight) { - if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) - lw->inv_weight = 1; - else - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) - / (lw->weight+1); - } - - tmp = (u64)delta_exec * weight; - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
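/*
 * Illustrative, user-space sketch (not part of sched.c): shows how a
 * precomputed inverse (~2^32 / weight, the prio_to_wmult[] table that
 * follows) lets calc_delta_mine() above replace the division in
 * "delta *= weight / lw" with a multiply and a 32-bit shift, and how one
 * nice level is worth roughly a factor of 1.25 in weight. The kernel
 * version additionally guards against 64-bit overflow (the SRR() dance);
 * this sketch does not. All names below are local to this sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT 32

static uint64_t scale_delta(uint64_t delta_exec_ns, uint32_t task_weight,
			    uint32_t queue_weight)
{
	/* inv_weight ~= 2^32 / queue_weight, as precalculated in prio_to_wmult[] */
	uint64_t inv_weight = ((uint64_t)1 << WMULT_SHIFT) / queue_weight;

	/* (delta * task_weight) / queue_weight, done as multiply + shift */
	return (delta_exec_ns * task_weight * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	uint64_t delta = 1000000;	/* 1 ms of runtime, in ns */

	/* nice 0 has weight 1024, nice -1 has 1277 (~1.25x, see the table above) */
	printf("charged against nice  0 weight: %llu ns\n",
	       (unsigned long long)scale_delta(delta, 1024, 1024));
	printf("charged against nice -1 weight: %llu ns\n",
	       (unsigned long long)scale_delta(delta, 1024, 1277));
	return 0;
}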
- * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; - -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -#ifdef CONFIG_SMP -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator); - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator); -#endif - -#ifdef CONFIG_CGROUP_CPUACCT -static void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) -typedef int (*tg_visitor)(struct task_group *, void *); - -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - */ -static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) -{ - struct task_group *parent, *child; - int ret; - - rcu_read_lock(); - parent = &root_task_group; -down: - ret = (*down)(parent, data); - if (ret) - goto out_unlock; - list_for_each_entry_rcu(child, &parent->children, siblings) { - parent = child; - goto down; - -up: - continue; - } - ret = (*up)(parent, data); - if (ret) - goto out_unlock; - - child = parent; - parent = parent->parent; - if (parent) - goto up; -out_unlock: - rcu_read_unlock(); - - return ret; -} - -static int tg_nop(struct task_group *tg, void *data) -{ - return 0; -} -#endif - -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); - - if (nr_running) - rq->avg_load_per_task = rq->load.weight / nr_running; - else - rq->avg_load_per_task = 0; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. 
- */ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) -{ - unsigned long shares; - unsigned long rq_weight; - - if (!tg->se[cpu]) - return; - - rq_weight = tg->cfs_rq[cpu]->rq_weight; - - /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - - __set_se_shares(tg->se[cpu], shares); - spin_unlock_irqrestore(&rq->lock, flags); - } -} - -/* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. - */ -static int tg_shares_up(struct task_group *tg, void *data) -{ - unsigned long weight, rq_weight = 0; - unsigned long shares = 0; - struct sched_domain *sd = data; - int i; - - for_each_cpu(i, sched_domain_span(sd)) { - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - weight = tg->cfs_rq[i]->load.weight; - if (!weight) - weight = NICE_0_LOAD; - - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; - shares += tg->cfs_rq[i]->shares; - } - - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); - - return 0; -} - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. - */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_shares(struct sched_domain *sd) -{ - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; - - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } -} - -static void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ - spin_unlock(&rq->lock); - update_shares(sd); - spin_lock(&rq->lock); -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#else - -static inline void update_shares(struct sched_domain *sd) -{ -} - -static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ -} - -#endif - -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
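/*
 * Illustrative, user-space sketch (not from sched.c): double_lock_balance()
 * below avoids ABBA deadlock between two runqueue locks by falling back to
 * address order ("ascending &runqueue") when the trylock fails. The same
 * idea expressed with pthread mutexes; all names here are local to this
 * sketch only.
 */
#include <pthread.h>
#include <stdio.h>

struct rq_like {
	pthread_mutex_t lock;
	int nr_running;
};

static struct rq_like rq1 = { PTHREAD_MUTEX_INITIALIZER, 3 };
static struct rq_like rq2 = { PTHREAD_MUTEX_INITIALIZER, 1 };

/* Lock two queues in a globally consistent (address) order. */
static void lock_pair(struct rq_like *a, struct rq_like *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
		return;
	}
	if (a < b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_pair(struct rq_like *a, struct rq_like *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	lock_pair(&rq1, &rq2);
	printf("holding both: %d vs %d tasks\n", rq1.nr_running, rq2.nr_running);
	unlock_pair(&rq1, &rq2);
	return 0;
}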
- */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - -#include "sched_stats.h" -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif - -#define sched_class_highest (&rt_sched_class) -#define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) - -static void inc_nr_running(struct rq *rq) -{ - rq->nr_running++; -} - -static void dec_nr_running(struct rq *rq) -{ - rq->nr_running--; -} - -static void set_load_weight(struct task_struct *p) -{ - if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; - return; - } - - /* - * SCHED_IDLE tasks get minimal weight: - */ - if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; - return; - } - - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; -} - -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) -{ - sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); - p->se.on_rq = 1; -} - -static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) -{ - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; - } - - sched_info_dequeued(p); - p->sched_class->dequeue_task(rq, p, sleep); - p->se.on_rq = 0; -} - -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) -{ - return p->static_prio; -} - -/* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. - */ -static inline int normal_prio(struct task_struct *p) -{ - int prio; - - if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. 
- */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - -/* - * activate_task - move a task to the runqueue. - */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, wakeup); - inc_nr_running(rq); -} - -/* - * deactivate_task - remove a task from the runqueue. - */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - - dequeue_task(rq, p, sleep); - dec_nr_running(rq); -} - -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. - */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio, int running) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); -} - -#ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - -/* - * Is this task likely cache-hot: - */ -static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) - return 1; - - if (p->sched_class != &fair_sched_class) - return 0; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - - -void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -{ - int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; - - trace_sched_migrate_task(p, task_cpu(p), new_cpu); - -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; - if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); - } -#endif - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; - - __set_task_cpu(p, new_cpu); -} - -struct migration_req { - struct list_head list; - - struct task_struct *task; - int dest_cpu; - - struct completion 
done; -}; - -/* - * The task's runqueue lock must be held. - * Returns true if you have to wait for migration thread. - */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) -{ - struct rq *rq = task_rq(p); - - /* - * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. - */ - if (!p->se.on_rq && !task_running(rq, p)) { - set_task_cpu(p, dest_cpu); - return 0; - } - - init_completion(&req->done); - req->task = p; - req->dest_cpu = dest_cpu; - list_add(&req->list, &rq->migration_queue); - - return 1; -} - -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - unsigned long flags; - int running, on_rq; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - trace_sched_wait_task(rq, p); - running = task_running(rq, p); - on_rq = p->se.on_rq; - ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, &flags); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it wa still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! 
- */ - break; - } - - return ncsw; -} - -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - * - * NOTE: this function doesnt have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. 
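/*
 * Illustrative, user-space sketch (not from sched.c): the check at the end
 * of find_idlest_group() above keeps the task local unless the idlest
 * remote group is clearly less loaded. Assuming imbalance_pct = 125 (a
 * typical per-domain default of that era), the halved margin works out to
 * 112 with integer math, i.e. the local group must be at least 12% busier
 * before the task is placed remotely. Names below are local to this sketch.
 */
#include <stdio.h>

static int prefer_remote_group(unsigned long this_load,
			       unsigned long min_load,
			       unsigned int imbalance_pct)
{
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	/* mirrors: if (!idlest || 100*this_load < imbalance*min_load) stay local */
	return 100 * this_load >= imbalance * min_load;
}

int main(void)
{
	printf("local 1100 vs remote 1000 -> %s\n",
	       prefer_remote_group(1100, 1000, 125) ? "go remote" : "stay local");
	printf("local 1200 vs remote 1000 -> %s\n",
	       prefer_remote_group(1200, 1000, 125) ? "go remote" : "stay local");
	return 0;
}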
- */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - -#endif /* CONFIG_SMP */ - -/*** - * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread - * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? - * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * returns failure only if the task is already active. 
- */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) -{ - int cpu, orig_cpu, this_cpu, success = 0; - unsigned long flags; - long old_state; - struct rq *rq; - - if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; - -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE)) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif - - smp_wmb(); - rq = task_rq_lock(p, &flags); - update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) - goto out; - - if (p->se.on_rq) - goto out_running; - - cpu = task_cpu(p); - orig_cpu = cpu; - this_cpu = smp_processor_id(); - -#ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; - - cpu = p->sched_class->select_task_rq(p, sync); - if (cpu != orig_cpu) { - set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } - -#ifdef CONFIG_SCHEDSTATS - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - } -#endif /* CONFIG_SCHEDSTATS */ - -out_activate: -#endif /* CONFIG_SMP */ - schedstat_inc(p, se.nr_wakeups); - if (sync) - schedstat_inc(p, se.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.nr_wakeups_local); - else - schedstat_inc(p, se.nr_wakeups_remote); - activate_task(rq, p, 1); - success = 1; - -out_running: - trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); -#endif -out: - current->se.last_wakeup = current->se.sum_exec_runtime; - - task_rq_unlock(rq, &flags); - - return success; -} -#endif /* !DDE_LINUX */ - -int wake_up_process(struct task_struct *p) -{ - return try_to_wake_up(p, TASK_ALL, 0); -} -EXPORT_SYMBOL(wake_up_process); - -int wake_up_state(struct task_struct *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - -#ifndef DDE_LINUX -/* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. - * - * __sched_fork() is basic setup used by init_idle() too: - */ -static void __sched_fork(struct task_struct *p) -{ - p->se.exec_start = 0; - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->se.last_wakeup = 0; - p->se.avg_overlap = 0; - -#ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; -#endif - - INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. 
This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; -} - -/* - * fork()/clone()-time setup: - */ -void sched_fork(struct task_struct *p, int clone_flags) -{ - int cpu = get_cpu(); - - __sched_fork(p); - -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - - /* - * Make sure we do not leak PI boosting priority to the child: - */ - p->prio = current->normal_prio; - if (!rt_prio(p->prio)) - p->sched_class = &fair_sched_class; - -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (likely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -#endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; -#endif -#ifdef CONFIG_PREEMPT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif - put_cpu(); -} - -/* - * wake_up_new_task - wake up a newly created task for the first time. - * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. - */ -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) -{ - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); - update_rq_clock(rq); - - p->prio = effective_prio(p); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } - trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); -#ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); -#endif - task_rq_unlock(rq, &flags); -} - -#ifdef CONFIG_PREEMPT_NOTIFIERS - -/** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled - * @notifier: notifier struct to register - */ -void preempt_notifier_register(struct preempt_notifier *notifier) -{ - hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -} -EXPORT_SYMBOL_GPL(preempt_notifier_register); - -/** - * preempt_notifier_unregister - no longer interested in preemption notifications - * @notifier: notifier struct to unregister - * - * This is safe to call from within a preemption notifier. 
- */ -void preempt_notifier_unregister(struct preempt_notifier *notifier) -{ - hlist_del(¬ifier->link); -} -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); -} - -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ -} - -#endif /* CONFIG_PREEMPT_NOTIFIERS */ - -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @prev: the current task that is being switched out - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. - */ -static inline void -prepare_task_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); - prepare_arch_switch(next); -} - -/** - * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch - * @prev: the thread we just switched away from. - * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. - * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) - */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) - __releases(rq->lock) -{ - struct mm_struct *mm = rq->prev_mm; - long prev_state; - - rq->prev_mm = NULL; - - /* - * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_DEAD in tsk->state and calls - * schedule one last time. The schedule call will never return, and - * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul <manfred@colorfullife.com> - */ - prev_state = prev->state; - finish_arch_switch(prev); - finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (current->sched_class->post_schedule) - current->sched_class->post_schedule(rq); -#endif - - fire_sched_in_preempt_notifiers(current); - if (mm) - mmdrop(mm); - if (unlikely(prev_state == TASK_DEAD)) { - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. 
- */ - kprobe_flush_task(prev); - put_task_struct(prev); - } -} - -/** - * schedule_tail - first thing a freshly forked thread must call. - * @prev: the thread we just switched away from. - */ -asmlinkage void schedule_tail(struct task_struct *prev) - __releases(rq->lock) -{ - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ - preempt_enable(); -#endif - if (current->set_child_tid) - put_user(task_pid_vnr(current), current->set_child_tid); -} - -/* - * context_switch - switch to the new MM and the new - * thread's register state. - */ -static inline void -context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - struct mm_struct *mm, *oldmm; - - prepare_task_switch(rq, prev, next); - trace_sched_switch(rq, prev, next); - mm = next->mm; - oldmm = prev->active_mm; - /* - * For paravirt, this is coupled with an exit in switch_to to - * combine the page table reload and the switch backend into - * one hypercall. - */ - arch_enter_lazy_cpu_mode(); - - if (unlikely(!mm)) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm(oldmm, mm, next); - - if (unlikely(!prev->mm)) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; - } - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif - - /* Here we just switch the register state and the stack. */ - switch_to(prev, next, prev); - - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); -} - -/* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. - */ -unsigned long nr_running(void) -{ - unsigned long i, sum = 0; - - for_each_online_cpu(i) - sum += cpu_rq(i)->nr_running; - - return sum; -} - -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - -unsigned long long nr_context_switches(void) -{ - int i; - unsigned long long sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_switches; - - return sum; -} - -unsigned long nr_iowait(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); - - return sum; -} - -unsigned long nr_active(void) -{ - unsigned long i, running = 0, uninterruptible = 0; - - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } - - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; - - return running + uninterruptible; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). 
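/*
 * Illustrative, user-space sketch (not from sched.c): update_cpu_load()
 * below keeps several exponentially decaying averages of rq->load.weight,
 * with cpu_load[i] tracking the instantaneous load ever more sluggishly as
 * i grows (old*(2^i - 1)/2^i + new/2^i each tick, index 0 being the raw
 * value). Names below are local to this sketch only.
 */
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static void fake_tick(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
		      unsigned long this_load)
{
	for (int i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		/* round up when rising so the average can actually reach it */
		if (new_load > old_load)
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };

	/* a short burst of load followed by idleness */
	for (int t = 0; t < 6; t++) {
		fake_tick(cpu_load, t < 3 ? 2048 : 0);
		printf("tick %d: %lu %lu %lu %lu %lu\n", t,
		       cpu_load[0], cpu_load[1], cpu_load[2],
		       cpu_load[3], cpu_load[4]);
	}
	return 0;
}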
- */ -static void update_cpu_load(struct rq *this_rq) -{ - unsigned long this_load = this_rq->load.weight; - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; - } -} - -#ifdef CONFIG_SMP - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&rq2->lock); - spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } - update_rq_clock(rq1); - update_rq_clock(rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - spin_unlock(&rq1->lock); - if (rq1 != rq2) - spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -/* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. - */ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) -{ - struct migration_req req; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; - - /* force the process onto the specified CPU */ - if (migrate_task(p, dest_cpu, &req)) { - /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = rq->migration_thread; - - get_task_struct(mt); - task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); - - return; - } -out: - task_rq_unlock(rq, &flags); -} - -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - -/* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. - */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) -{ - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. 
- */ - check_preempt_curr(this_rq, p, 0); -} - -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. - */ - if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); - return 0; - } - *all_pinned = 0; - - if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); - return 0; - } - - /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. - */ - - if (!task_hot(p, rq->clock, sd) || - sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) { - schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); - } -#endif - return 1; - } - - if (task_hot(p, rq->clock, sd)) { - schedstat_inc(p, se.nr_failed_migrations_hot); - return 0; - } - return 1; -} - -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator) -{ - int loops = 0, pulled = 0, pinned = 0; - struct task_struct *p; - long rem_load_move = max_load_move; - - if (max_load_move == 0) - goto out; - - pinned = 1; - - /* - * Start the load-balancing iterator: - */ - p = iterator->start(iterator->arg); -next: - if (!p || loops++ > sysctl_sched_nr_migrate) - goto out; - - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - p = iterator->next(iterator->arg); - goto next; - } - - pull_task(busiest, p, this_rq, this_cpu); - pulled++; - rem_load_move -= p->se.load.weight; - - /* - * We only want to steal up to the prescribed amount of weighted load. - */ - if (rem_load_move > 0) { - if (p->prio < *this_best_prio) - *this_best_prio = p->prio; - p = iterator->next(iterator->arg); - goto next; - } -out: - /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). - */ - schedstat_add(sd, lb_gained[idle], pulled); - - if (all_pinned) - *all_pinned = pinned; - - return max_load_move - rem_load_move; -} - -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. 
- */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - const struct sched_class *class = sched_class_highest; - unsigned long total_load_moved = 0; - int this_best_prio = this_rq->curr->prio; - - do { - total_load_moved += - class->load_balance(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); - class = class->next; - - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) - break; - - } while (class && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator) -{ - struct task_struct *p = iterator->start(iterator->arg); - int pinned = 0; - - while (p) { - if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - pull_task(busiest, p, this_rq, this_cpu); - /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). - */ - schedstat_inc(sd, lb_gained[idle]); - - return 1; - } - p = iterator->next(iterator->arg); - } - - return 0; -} - -/* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - const struct sched_class *class; - - for (class = sched_class_highest; class; class = class->next) - if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) - return 1; - - return 0; -} - -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. 
- */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) -{ - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; - unsigned long max_pull; - unsigned long busiest_load_per_task, busiest_nr_running; - unsigned long this_load_per_task, this_nr_running; - int load_idx, group_imb = 0; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance = 1; - unsigned long leader_nr_running = 0, min_load_per_task = 0; - unsigned long min_nr_running = ULONG_MAX; - struct sched_group *group_min = NULL, *group_leader = NULL; -#endif - - max_load = this_load = total_load = total_pwr = 0; - busiest_load_per_task = busiest_nr_running = 0; - this_load_per_task = this_nr_running = 0; - - if (idle == CPU_NOT_IDLE) - load_idx = sd->busy_idx; - else if (idle == CPU_NEWLY_IDLE) - load_idx = sd->newidle_idx; - else - load_idx = sd->idle_idx; - - do { - unsigned long load, group_capacity, max_cpu_load, min_cpu_load; - int local_group; - int i; - int __group_imb = 0; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_nr_running, sum_weighted_load; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - if (local_group) - balance_cpu = cpumask_first(sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - sum_weighted_load = sum_nr_running = avg_load = 0; - sum_avg_load_per_task = avg_load_per_task = 0; - - max_cpu_load = 0; - min_cpu_load = ~0UL; - - for_each_cpu_and(i, sched_group_cpus(group), cpus) { - struct rq *rq = cpu_rq(i); - - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - - /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - - load = target_load(i, load_idx); - } else { - load = source_load(i, load_idx); - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - } - - avg_load += load; - sum_nr_running += rq->nr_running; - sum_weighted_load += weighted_cpuload(i); - - sum_avg_load_per_task += cpu_avg_load_per_task(i); - } - - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu && balance) { - *balance = 0; - goto ret; - } - - total_load += avg_load; - total_pwr += group->__cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); - - - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? 
- */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); - - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - __group_imb = 1; - - group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - - if (local_group) { - this_load = avg_load; - this = group; - this_nr_running = sum_nr_running; - this_load_per_task = sum_weighted_load; - } else if (avg_load > max_load && - (sum_nr_running > group_capacity || __group_imb)) { - max_load = avg_load; - busiest = group; - busiest_nr_running = sum_nr_running; - busiest_load_per_task = sum_weighted_load; - group_imb = __group_imb; - } - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || - !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto group_next; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (this_nr_running >= group_capacity || - !this_nr_running)) - power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!power_savings_balance || sum_nr_running >= group_capacity - || !sum_nr_running) - goto group_next; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && - cpumask_first(sched_group_cpus(group)) > - cpumask_first(sched_group_cpus(group_min)))) { - group_min = group; - min_nr_running = sum_nr_running; - min_load_per_task = sum_weighted_load / - sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - cpumask_first(sched_group_cpus(group)) < - cpumask_first(sched_group_cpus(group_leader)))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } - } -group_next: -#endif - group = group->next; - } while (group != sd->groups); - - if (!busiest || this_load >= max_load || busiest_nr_running == 0) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) - goto out_balanced; - - busiest_load_per_task /= busiest_nr_running; - if (group_imb) - busiest_load_per_task = min(busiest_load_per_task, avg_load); - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. 
- */ - if (max_load <= busiest_load_per_task) - goto out_balanced; - - /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) - */ - if (max_load < avg_load) { - *imbalance = 0; - goto small_imbalance; - } - - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->__cpu_power, - (avg_load - this_load) * this->__cpu_power) - / SCHED_LOAD_SCALE; - - /* - * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (*imbalance < busiest_load_per_task) { - unsigned long tmp, pwr_now, pwr_move; - unsigned int imbn; - -small_imbalance: - pwr_move = pwr_now = 0; - imbn = 2; - if (this_nr_running) { - this_load_per_task /= this_nr_running; - if (busiest_load_per_task > this_load_per_task) - imbn = 1; - } else - this_load_per_task = cpu_avg_load_per_task(this_cpu); - - if (max_load - this_load + busiest_load_per_task >= - busiest_load_per_task * imbn) { - *imbalance = busiest_load_per_task; - return busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. - */ - - pwr_now += busiest->__cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->__cpu_power * - min(this_load_per_task, this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(busiest, - busiest_load_per_task * SCHED_LOAD_SCALE); - if (max_load > tmp) - pwr_move += busiest->__cpu_power * - min(busiest_load_per_task, max_load - tmp); - - /* Amount of load we'd add */ - if (max_load * busiest->__cpu_power < - busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(this, - max_load * busiest->__cpu_power); - else - tmp = sg_div_cpu_power(this, - busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += this->__cpu_power * - min(this_load_per_task, this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = busiest_load_per_task; - } - - return busiest; - -out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto ret; - - if (this == group_leader && group_leader != group_min) { - *imbalance = min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - cpumask_first(sched_group_cpus(group_leader)); - } - return group_min; - } -#endif -ret: - *imbalance = 0; - return NULL; -} - -/* - * find_busiest_queue - find the busiest runqueue among the cpus in group. 
- */ -static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) -{ - struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; - int i; - - for_each_cpu(i, sched_group_cpus(group)) { - unsigned long wl; - - if (!cpumask_test_cpu(i, cpus)) - continue; - - rq = cpu_rq(i); - wl = weighted_cpuload(i); - - if (rq->nr_running == 1 && wl > imbalance) - continue; - - if (wl > max_load) { - max_load = wl; - busiest = rq; - } - } - - return busiest; -} - -/* - * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but - * so long as it is large enough. - */ -#define MAX_PINNED_INTERVAL 512 - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - */ -static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, struct cpumask *cpus) -{ - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; - struct sched_group *group; - unsigned long imbalance; - struct rq *busiest; - unsigned long flags; - - cpumask_setall(cpus); - - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - - schedstat_inc(sd, lb_count[idle]); - -redo: - update_shares(sd); - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - cpus, balance); - - if (*balance == 0) - goto out_balanced; - - if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); - goto out_balanced; - } - - busiest = find_busiest_queue(group, idle, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[idle], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. ld_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - local_irq_save(flags); - double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &all_pinned); - double_rq_unlock(this_rq, busiest); - local_irq_restore(flags); - - /* - * some other cpu did the load balance for us. 
- */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); - - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) - goto redo; - goto out_balanced; - } - } - - if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; - - if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - - spin_lock_irqsave(&busiest->lock, flags); - - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu - */ - if (!cpumask_test_cpu(this_cpu, - &busiest->curr->cpus_allowed)) { - spin_unlock_irqrestore(&busiest->lock, flags); - all_pinned = 1; - goto out_one_pinned; - } - - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - spin_unlock_irqrestore(&busiest->lock, flags); - if (active_balance) - wake_up_process(busiest->migration_thread); - - /* - * We've kicked active balancing, reset the failure - * counter. - */ - sd->nr_balance_failed = sd->cache_nice_tries+1; - } - } else - sd->nr_balance_failed = 0; - - if (likely(!active_balance)) { - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * move_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - } - - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - - goto out; - -out_balanced: - schedstat_inc(sd, lb_balanced[idle]); - - sd->nr_balance_failed = 0; - -out_one_pinned: - /* tune up the balancing interval */ - if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) - sd->balance_interval *= 2; - - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; -out: - if (ld_moved) - update_shares(sd); - return ld_moved; -} - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - * - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). - * this_rq is locked. - */ -static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - struct cpumask *cpus) -{ - struct sched_group *group; - struct rq *busiest = NULL; - unsigned long imbalance; - int ld_moved = 0; - int sd_idle = 0; - int all_pinned = 0; - - cpumask_setall(cpus); - - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as CPU_NOT_IDLE. 
- */ - if (sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - - schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); -redo: - update_shares_locked(this_rq, sd); - group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, cpus, NULL); - if (!group) { - schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - /* this_rq->clock is already updated */ - update_rq_clock(busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, CPU_NEWLY_IDLE, - &all_pinned); - double_unlock_balance(this_rq, busiest); - - if (unlikely(all_pinned)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) - goto redo; - } - } - - if (!ld_moved) { - int active_balance = 0; - - schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) - return -1; - - if (sd->nr_balance_failed++ < 2) - return -1; - - /* - * The only task running in a non-idle cpu can be moved to this - * cpu in an attempt to completely freeup the other CPU - * package. The same method used to move task in load_balance() - * have been extended for load_balance_newidle() to speedup - * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) - * - * The package power saving logic comes from - * find_busiest_group(). If there are no imbalance, then - * f_b_g() will return NULL. However when sched_mc={1,2} then - * f_b_g() will select a group from which a running task may be - * pulled to this cpu in order to make the other package idle. - * If there is no opportunity to make a package idle and if - * there are no imbalance, then f_b_g() will return NULL and no - * action will be taken in load_balance_newidle(). - * - * Under normal task pull operation due to imbalance, there - * will be more than one task in the source run queue and - * move_tasks() will succeed. ld_moved will be true and this - * active balance code will not be triggered. 
- */ - - /* Lock busiest in correct order while this_rq is held */ - double_lock_balance(this_rq, busiest); - - /* - * don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu - */ - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { - double_unlock_balance(this_rq, busiest); - all_pinned = 1; - return ld_moved; - } - - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - - double_unlock_balance(this_rq, busiest); - /* - * Should not call ttwu while holding a rq->lock - */ - spin_unlock(&this_rq->lock); - if (active_balance) - wake_up_process(busiest->migration_thread); - spin_lock(&this_rq->lock); - - } else - sd->nr_balance_failed = 0; - - update_shares_locked(this_rq, sd); - return ld_moved; - -out_balanced: - schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - sd->nr_balance_failed = 0; - - return 0; -} - -/* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. - */ -static void idle_balance(int this_cpu, struct rq *this_rq) -{ - struct sched_domain *sd; - int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; - cpumask_var_t tmpmask; - - if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) - return; - - for_each_domain(this_cpu, sd) { - unsigned long interval; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - if (sd->flags & SD_BALANCE_NEWIDLE) - /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, tmpmask); - - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) - break; - } - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ - this_rq->next_balance = next_balance; - } - free_cpumask_var(tmpmask); -} - -/* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. - */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) -{ - int target_cpu = busiest_rq->push_cpu; - struct sched_domain *sd; - struct rq *target_rq; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - return; - - target_rq = cpu_rq(target_cpu); - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - update_rq_clock(busiest_rq); - update_rq_clock(target_rq); - - /* Search for an sd spanning us and the target CPU. 
*/ - for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - schedstat_inc(sd, alb_count); - - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - double_unlock_balance(busiest_rq, target_rq); -} - -#ifdef CONFIG_NO_HZ -static struct { - atomic_t load_balancer; - cpumask_var_t cpu_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; - -/* - * This routine will try to nominate the ilb (idle load balancing) - * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. - * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. - * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() - */ -int select_nohz_load_balancer(int stop_tick) -{ - int cpu = smp_processor_id(); - - if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - - if (!cpu_active(cpu)) { - if (atomic_read(&nohz.load_balancer) != cpu) - return 0; - - /* - * If we are going offline and still the leader, - * give up! - */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) - BUG(); - - return 0; - } - - cpumask_set_cpu(cpu, nohz.cpu_mask); - - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } - - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) - return 1; - } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; - - cpumask_clear_cpu(cpu, nohz.cpu_mask); - - if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) - BUG(); - } - return 0; -} -#endif - -static DEFINE_SPINLOCK(balancing); - -/* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) -{ - int balance = 1; - struct rq *rq = cpu_rq(cpu); - unsigned long interval; - struct sched_domain *sd; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - int need_serialize; - cpumask_var_t tmp; - - /* Fails alloc? Rebalancing probably not a priority right now. 
*/ - if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) - return; - - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; - - need_serialize = sd->flags & SD_SERIALIZE; - - if (need_serialize) { - if (!spin_trylock(&balancing)) - goto out; - } - - if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { - /* - * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. - */ - idle = CPU_NOT_IDLE; - } - sd->last_balance = jiffies; - } - if (need_serialize) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) { - next_balance = sd->last_balance + interval; - update_next_balance = 1; - } - - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. - */ - if (!balance) - break; - } - - /* - * next_balance will be updated only when there is a need. - * When the cpu is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - rq->next_balance = next_balance; - - free_cpumask_var(tmp); -} - -/* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. - */ -static void run_rebalance_domains(struct softirq_action *h) -{ - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? - CPU_IDLE : CPU_NOT_IDLE; - - rebalance_domains(this_cpu, idle); - -#ifdef CONFIG_NO_HZ - /* - * If this cpu is the owner for idle load balancing, then do the - * balancing on behalf of the other idle cpus whose ticks are - * stopped. - */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif -} - -/* - * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. - */ -static inline void trigger_load_balance(struct rq *rq, int cpu) -{ -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. 
- */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); -} - -#else /* CONFIG_SMP */ - -/* - * on UP we do not need to balance between CPUs: - */ -static inline void idle_balance(int cpu, struct rq *rq) -{ -} - -#endif - -DEFINE_PER_CPU(struct kernel_stat, kstat); - -EXPORT_PER_CPU_SYMBOL(kstat); - -/* - * Return any ns on the sched_clock that have not yet been banked in - * @p in case that task is currently running. - */ -unsigned long long task_delta_exec(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - - if (task_current(rq, p)) { - u64 delta_exec; - - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns = delta_exec; - } - - task_rq_unlock(rq, &flags); - - return ns; -} - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; - - /* Add user time to process. */ - p->utime = cputime_add(p->utime, cputime); - p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); - account_group_user_time(p, cputime); - - /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - cputime64_t tmp; - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - - tmp = cputime_to_cputime64(cputime); - - /* Add guest time to process. */ - p->utime = cputime_add(p->utime, cputime); - p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); - account_group_user_time(p, cputime); - p->gtime = cputime_add(p->gtime, cputime); - - /* Add guest time to cpustat. 
*/ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); - return; - } - - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else - cpustat->system = cputime64_add(cpustat->system, tmp); - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - - cpustat->steal = cputime64_add(cpustat->steal, cputime64); -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); - else - cpustat->idle = cputime64_add(cpustat->idle, cputime64); -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - -/* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); - struct rq *rq = this_rq(); - - if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); - else if (p != rq->idle) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, - one_jiffy_scaled); - else - account_idle_time(one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. 
- * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - account_idle_time(jiffies_to_cputime(ticks)); -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) -{ - return p->utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - return p->stime; -} -#else -cputime_t task_utime(struct task_struct *p) -{ - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; - - /* - * Use CFS's precise accounting: - */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - - if (total) { - temp *= utime; - do_div(temp, total); - } - utime = (clock_t)temp; - - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - clock_t stime; - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); - - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); - - return p->prev_stime; -} -#endif - -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; -} - -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. - */ -void scheduler_tick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; - - sched_clock_tick(); - - spin_lock(&rq->lock); - update_rq_clock(rq); - update_cpu_load(rq); - curr->sched_class->task_tick(rq, curr, 0); - spin_unlock(&rq->lock); - -#ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); - trigger_load_balance(rq, cpu); -#endif -} - -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - -void __kprobes add_preempt_count(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; -#endif - preempt_count() += val; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Spinlock count overflowing soon? - */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -#endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); -} -EXPORT_SYMBOL(add_preempt_count); - -void __kprobes sub_preempt_count(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? 
- */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; -#endif - - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; -} -EXPORT_SYMBOL(sub_preempt_count); - -#endif - -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) -{ - struct pt_regs *regs = get_irq_regs(); - - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); - - debug_show_held_locks(prev); - print_modules(); - if (irqs_disabled()) - print_irqtrace_events(prev); - - if (regs) - show_regs(regs); - else - dump_stack(); -} - -/* - * Various schedule()-time debugging checks and statistics: - */ -static inline void schedule_debug(struct task_struct *prev) -{ - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. - */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) - __schedule_bug(prev); - - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - - schedstat_inc(this_rq(), sched_count); -#ifdef CONFIG_SCHEDSTATS - if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); - schedstat_inc(prev, sched_info.bkl_count); - } -#endif -} - -/* - * Pick up the highest-prio task: - */ -static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) -{ - const struct sched_class *class; - struct task_struct *p; - - /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: - */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq); - if (likely(p)) - return p; - } - - class = sched_class_highest; - for ( ; ; ) { - p = class->pick_next_task(rq); - if (p) - return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; - } -} - -/* - * schedule() is the main scheduler function. - */ -asmlinkage void __sched schedule(void) -{ - struct task_struct *prev, *next; - unsigned long *switch_count; - struct rq *rq; - int cpu; - -need_resched: - preempt_disable(); - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); - prev = rq->curr; - switch_count = &prev->nivcsw; - - release_kernel_lock(prev); -need_resched_nonpreemptible: - - schedule_debug(prev); - - if (sched_feat(HRTICK)) - hrtick_clear(rq); - - spin_lock_irq(&rq->lock); - update_rq_clock(rq); - clear_tsk_need_resched(prev); - - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) - prev->state = TASK_RUNNING; - else - deactivate_task(rq, prev, 1); - switch_count = &prev->nvcsw; - } - -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif - - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); - - prev->sched_class->put_prev_task(rq, prev); - next = pick_next_task(rq, prev); - - if (likely(prev != next)) { - sched_info_switch(prev, next); - - rq->nr_switches++; - rq->curr = next; - ++*switch_count; - - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. 
- */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - } else - spin_unlock_irq(&rq->lock); - - if (unlikely(reacquire_kernel_lock(current) < 0)) - goto need_resched_nonpreemptible; - - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; -} -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_PREEMPT -/* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage void __sched preempt_schedule(void) -{ - struct thread_info *ti = current_thread_info(); - - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (likely(ti->preempt_count || irqs_disabled())) - return; - - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); -} -EXPORT_SYMBOL(preempt_schedule); - -/* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. - */ -asmlinkage void __sched preempt_schedule_irq(void) -{ - struct thread_info *ti = current_thread_info(); - - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); - - do { - add_preempt_count(PREEMPT_ACTIVE); - local_irq_enable(); - schedule(); - local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); -} - -#endif /* CONFIG_PREEMPT */ -#endif /* !DDE_LINUX */ - -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, - void *key) -{ - return try_to_wake_up(curr->private, mode, sync); -} -EXPORT_SYMBOL(default_wake_function); - -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, sync, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. 
- * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) -{ - __wake_up_common(q, mode, 1, 0, NULL); -} - -/** - * __wake_up_sync - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - unsigned long flags; - int sync = 1; - - if (unlikely(!q)) - return; - - if (unlikely(!nr_exclusive)) - sync = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - */ -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. 
- */ -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending_state(state, current)) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - */ -unsigned long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. 
It can be - * interrupted by a kill signal. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Returns: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ - int ret = 1; - - spin_lock_irq(&x->wait.lock); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irq(&x->wait.lock); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Returns: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ - int ret = 1; - - spin_lock_irq(&x->wait.lock); - if (!x->done) - ret = 0; - spin_unlock_irq(&x->wait.lock); - return ret; -} -EXPORT_SYMBOL(completion_done); - -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -#ifndef DDE_LINUX -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - -#ifdef CONFIG_RT_MUTEXES - -/* - * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) - * - * This function changes the 'effective' priority of a task. It does - * not touch ->normal_prio like __setscheduler(). - * - * Used by the rt_mutex code to implement priority inheritance logic. 
- */ -void rt_mutex_setprio(struct task_struct *p, int prio) -{ - unsigned long flags; - int oldprio, on_rq, running; - struct rq *rq; - const struct sched_class *prev_class = p->sched_class; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = task_rq_lock(p, &flags); - update_rq_clock(rq); - - oldprio = p->prio; - on_rq = p->se.on_rq; - running = task_current(rq, p); - if (on_rq) - dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - - if (rt_prio(prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - - p->prio = prio; - - if (running) - p->sched_class->set_curr_task(rq); - if (on_rq) { - enqueue_task(rq, p, 0); - - check_class_changed(rq, p, prev_class, oldprio, running); - } - task_rq_unlock(rq, &flags); -} - -#endif - -void set_user_nice(struct task_struct *p, long nice) -{ - int old_prio, delta, on_rq; - unsigned long flags; - struct rq *rq; - - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) - return; - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. - */ - rq = task_rq_lock(p, &flags); - update_rq_clock(rq); - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: - */ - if (task_has_rt_policy(p)) { - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; - } - on_rq = p->se.on_rq; - if (on_rq) - dequeue_task(rq, p, 0); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); - old_prio = p->prio; - p->prio = effective_prio(p); - delta = p->prio - old_prio; - - if (on_rq) { - enqueue_task(rq, p, 0); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); - } -out_unlock: - task_rq_unlock(rq, &flags); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; - - return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || - capable(CAP_SYS_NICE)); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - - nice = PRIO_TO_NICE(current->static_prio) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; - - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * This is the priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. 
- */ -int task_prio(const struct task_struct *p) -{ - return p->prio - MAX_RT_PRIO; -} - -/** - * task_nice - return the nice value of a given task. - * @p: the task in question. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - */ -int idle_cpu(int cpu) -{ - return cpu_curr(cpu) == cpu_rq(cpu)->idle; -} - -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - */ -static struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -/* Actually do priority change: must hold rq lock. */ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) -{ - BUG_ON(p->se.on_rq); - - p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ - p->prio = rt_mutex_getprio(p); - set_load_weight(p); -} -#endif - -/* - * check the target process has a UID that matches the current process's - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - bool match; - - rcu_read_lock(); - pcred = __task_cred(p); - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); - rcu_read_unlock(); - return match; -} - -static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) -{ -#ifndef DDE_LINUX - int retval, oldprio, oldpolicy = -1, on_rq, running; - unsigned long flags; - const struct sched_class *prev_class = p->sched_class; - struct rq *rq; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); -recheck: - /* double check policy once rq lock held */ - if (policy < 0) - policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. 
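A minimal hedged mirror of that rule, written against the userspace <sched.h> constants (the helper name is ours; MAX_USER_RT_PRIO-1 corresponds to sched_get_priority_max() for the RT policies, 99 on a stock build):

#include <sched.h>

/* param_is_valid() simply restates the check that follows in the code. */
static int param_is_valid(int policy, const struct sched_param *param)
{
        if (policy == SCHED_FIFO || policy == SCHED_RR)
                return param->sched_priority >= sched_get_priority_min(policy) &&
                       param->sched_priority <= sched_get_priority_max(policy);
        return param->sched_priority == 0;      /* SCHED_OTHER, SCHED_BATCH, SCHED_IDLE */
}

int main(void)
{
        struct sched_param fifo  = { .sched_priority = 10 };
        struct sched_param other = { .sched_priority = 0 };

        return !(param_is_valid(SCHED_FIFO, &fifo) &&
                 param_is_valid(SCHED_OTHER, &other));
}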
- */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) - return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) - return -EINVAL; - - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; - unlock_task_sighand(p, &flags); - - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; - } - /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: - */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; - - /* can't change other user's priorities */ - if (!check_same_owner(p)) - return -EPERM; - } - - if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; - } - - /* - * make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - */ - spin_lock_irqsave(&p->pi_lock, flags); - /* - * To be able to change p->policy safely, the apropriate - * runqueue lock must be held. - */ - rq = __task_rq_lock(p); - /* recheck policy now with rq lock held */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); - goto recheck; - } - update_rq_clock(rq); - on_rq = p->se.on_rq; - running = task_current(rq, p); - if (on_rq) - deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - - oldprio = p->prio; - __setscheduler(rq, p, policy, param->sched_priority); - - if (running) - p->sched_class->set_curr_task(rq); - if (on_rq) { - activate_task(rq, p, 0); - - check_class_changed(rq, p, prev_class, oldprio, running); - } - __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); - - rt_mutex_adjust_pi(p); - - return 0; -#else - //WARN_UNIMPL; - return 0; -#endif -} - -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, true); -} -EXPORT_SYMBOL_GPL(sched_setscheduler); - -#ifndef DDE_LINUX - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. 
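A hedged userspace sketch of the ordinary sched_setscheduler() path above, assuming the caller has CAP_SYS_NICE or enough RLIMIT_RTPRIO headroom (priority 10 is an arbitrary example); otherwise the permission checks above surface as EPERM:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };   /* 1..99 for SCHED_FIFO */

        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy=%d prio=%d\n", sched_getscheduler(0), sp.sched_priority);
        return 0;
}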
- */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, false); -} - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); - rcu_read_unlock(); - - return retval; -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, - struct sched_param __user *, param) -{ - /* negative values for policy are not valid */ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -{ - return do_sched_setscheduler(pid, -1, param); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -{ - struct task_struct *p; - int retval; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy; - } - read_unlock(&tasklist_lock); - return retval; -} - -/** - * sys_sched_getscheduler - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -{ - struct sched_param lp; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - - return retval; - -out_unlock: - read_unlock(&tasklist_lock); - return retval; -} - -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; - int retval; - - get_online_cpus(); - read_lock(&tasklist_lock); - - p = find_process_by_pid(pid); - if (!p) { - read_unlock(&tasklist_lock); - put_online_cpus(); - return -ESRCH; - } - - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. 
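A hedged userspace sketch of the same sched_setaffinity() path via the libc wrapper; CPU 0 is an arbitrary choice:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);               /* allow only CPU 0 */

        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}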
- */ - get_task_struct(p); - read_unlock(&tasklist_lock); - - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) - goto out_unlock; - - retval = security_task_setscheduler(p, 0, NULL); - if (retval) - goto out_unlock; - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); - again: - retval = set_cpus_allowed_ptr(p, new_mask); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_unlock: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); -out_put_task: - put_task_struct(p); - put_online_cpus(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - struct cpumask *new_mask) -{ - if (len < cpumask_size()) - cpumask_clear(new_mask); - else if (len > cpumask_size()) - len = cpumask_size(); - - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, struct cpumask *mask) -{ - struct task_struct *p; - int retval; - - get_online_cpus(); - read_lock(&tasklist_lock); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); - -out_unlock: - read_unlock(&tasklist_lock); - put_online_cpus(); - - return retval; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if (len < cpumask_size()) - return -EINVAL; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) - ret = -EFAULT; - else - ret = cpumask_size(); - } - free_cpumask_var(mask); - - return ret; -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. 
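A hedged userspace sketch combining the read side implemented above with the yield described here (the loop count is arbitrary):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;

        if (sched_getaffinity(0, sizeof(mask), &mask) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        printf("allowed CPUs: %d\n", CPU_COUNT(&mask));

        for (int i = 0; i < 3; i++)
                sched_yield();          /* give other runnable tasks a turn */
        return 0;
}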
- */ -SYSCALL_DEFINE0(sched_yield) -{ - struct rq *rq = this_rq_lock(); - - schedstat_inc(rq, yld_count); - current->sched_class->yield_task(rq); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); - - schedule(); - - return 0; -} -#endif /* !DDE_LINUX */ - -static void __cond_resched(void) -{ -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - __might_sleep(__FILE__, __LINE__); -#endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); -} - -int __sched _cond_resched(void) -{ - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { - __cond_resched(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(_cond_resched); - -/* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). - */ -int cond_resched_lock(spinlock_t *lock) -{ - int resched = need_resched() && system_state == SYSTEM_RUNNING; - int ret = 0; - - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched && need_resched()) - __cond_resched(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); - } - return ret; -} -EXPORT_SYMBOL(cond_resched_lock); - -int __sched cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (need_resched() && system_state == SYSTEM_RUNNING) { - local_bh_enable(); - __cond_resched(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(cond_resched_softirq); - -#ifndef DDE_LINUX -/** - * yield - yield the current processor to other threads. - * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} -EXPORT_SYMBOL(yield); - -/* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so - * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) - */ -void __sched io_schedule(void) -{ - struct rq *rq = &__raw_get_cpu_var(runqueues); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - schedule(); - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - -long __sched io_schedule_timeout(long timeout) -{ - struct rq *rq = &__raw_get_cpu_var(runqueues); - long ret; - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - ret = schedule_timeout(timeout); - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); - return ret; -} - -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. 
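A hedged userspace sketch querying these limits, plus the SCHED_RR timeslice reported by the code further below:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        printf("SCHED_FIFO:  %d..%d\n",
               sched_get_priority_min(SCHED_FIFO),
               sched_get_priority_max(SCHED_FIFO));
        printf("SCHED_OTHER: %d..%d\n",
               sched_get_priority_min(SCHED_OTHER),
               sched_get_priority_max(SCHED_OTHER));

        if (sched_rr_get_interval(0, &ts) == 0)
                printf("RR timeslice: %ld.%09ld s\n",
                       (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}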
- */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - } - return ret; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) -{ - struct task_struct *p; - unsigned int time_slice; - int retval; - struct timespec t; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - /* - * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER - * tasks that are on an otherwise idle runqueue: - */ - time_slice = 0; - if (p->policy == SCHED_RR) { - time_slice = DEF_TIMESLICE; - } else if (p->policy != SCHED_FIFO) { - struct sched_entity *se = &p->se; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - if (rq->cfs.load.weight) - time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); - } - read_unlock(&tasklist_lock); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; - -out_unlock: - read_unlock(&tasklist_lock); - return retval; -} - -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - -void sched_show_task(struct task_struct *p) -{ - unsigned long free = 0; - unsigned state; - - state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } -#endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); - - show_stack(p, NULL); -} - -void show_state_filter(unsigned long state_filter) -{ - struct task_struct *g, *p; - -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif - read_lock(&tasklist_lock); - do_each_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: - */ - touch_nmi_watchdog(); - if (!state_filter || (p->state & state_filter)) - sched_show_task(p); - } while_each_thread(g, p); - - touch_all_softlockup_watchdogs(); - -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); -#endif - read_unlock(&tasklist_lock); - /* - * Only show locks if all tasks are dumped: - */ - if (state_filter == -1) - debug_show_all_locks(); -} - -void __cpuinit init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - -/** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to - * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. - */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - - __sched_fork(idle); - idle->se.exec_start = sched_clock(); - - idle->prio = idle->normal_prio = MAX_PRIO; - cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); - __set_task_cpu(idle, cpu); - - rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; -#endif - spin_unlock_irqrestore(&rq->lock, flags); - - /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else - task_thread_info(idle)->preempt_count = 0; -#endif - /* - * The idle tasks have their own, simple scheduling class: - */ - idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); -} -#endif /* DDE_LINUX */ - -/* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_BITS_NONE. - */ -cpumask_var_t nohz_cpu_mask; - -#ifndef DDE_LINUX -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. 
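A hedged userspace mirror of that scaling (factor = 1 + log2 of the online CPU count), with our own ilog2 helper standing in for the kernel's:

#include <stdio.h>
#include <unistd.h>

static unsigned int ilog2_u(unsigned long v)    /* floor(log2(v)), v >= 1 */
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        long cpus = sysconf(_SC_NPROCESSORS_ONLN);
        unsigned int factor = 1 + ilog2_u(cpus > 0 ? (unsigned long)cpus : 1);

        printf("%ld CPUs -> granularity factor %u\n", cpus, factor);
        return 0;
}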
- * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_wakeup_granularity *= factor; - - sysctl_sched_shares_ratelimit *= factor; -} - -#ifdef CONFIG_SMP -/* - * This is how migration works: - * - * 1) we queue a struct migration_req structure in the source CPU's - * runqueue and wake up that CPU's migration thread. - * 2) we down() the locked semaphore => thread blocks. - * 3) migration thread wakes up (implicitly it forces the migrated - * thread off the CPU) - * 4) it gets the migration request and checks whether the migrated - * task is still in the wrong runqueue. - * 5) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 6) migration thread up()s the semaphore. - * 7) we wake up and the migration is done. - */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. - */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - struct migration_req req; - unsigned long flags; - struct rq *rq; - int ret = 0; - - rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { - ret = -EINVAL; - goto out; - } - - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpumask_equal(&p->cpus_allowed, new_mask))) { - ret = -EINVAL; - goto out; - } - - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); - } - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); - wait_for_completion(&req.done); - tlb_migrate_finish(p->mm); - return 0; - } -out: - task_rq_unlock(rq, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. - */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) -{ - struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; - - if (unlikely(!cpu_active(dest_cpu))) - return ret; - - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); - - double_rq_lock(rq_src, rq_dest); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; - /* Affinity changed (again). 
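A hedged userspace way to observe the migration sequence described above: shrink the affinity mask to a single CPU (CPU 0 here, arbitrarily) and compare sched_getcpu() before and after:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t one;
        int before = sched_getcpu();

        CPU_ZERO(&one);
        CPU_SET(0, &one);
        if (sched_setaffinity(0, sizeof(one), &one) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("was on CPU %d, now on CPU %d\n", before, sched_getcpu());
        return 0;
}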
*/ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto fail; - - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { - activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); - } -done: - ret = 1; -fail: - double_rq_unlock(rq_src, rq_dest); - return ret; -} - -/* - * migration_thread - this is a highprio system thread that performs - * thread migration by bumping thread off CPU then 'pushing' onto - * another runqueue. - */ -static int migration_thread(void *data) -{ - int cpu = (long)data; - struct rq *rq; - - rq = cpu_rq(cpu); - BUG_ON(rq->migration_thread != current); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - struct migration_req *req; - struct list_head *head; - - spin_lock_irq(&rq->lock); - - if (cpu_is_offline(cpu)) { - spin_unlock_irq(&rq->lock); - goto wait_to_die; - } - - if (rq->active_balance) { - active_load_balance(rq, cpu); - rq->active_balance = 0; - } - - head = &rq->migration_queue; - - if (list_empty(head)) { - spin_unlock_irq(&rq->lock); - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - continue; - } - req = list_entry(head->next, struct migration_req, list); - list_del_init(head->next); - - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - local_irq_enable(); - - complete(&req->done); - } - __set_current_state(TASK_RUNNING); - return 0; - -wait_to_die: - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) -{ - int ret; - - local_irq_disable(); - ret = __migrate_task(p, src_cpu, dest_cpu); - local_irq_enable(); - return ret; -} - -/* - * Figure out where task on dead CPU should go, use force if necessary. - */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) -{ - int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); - -again: - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto move; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); - if (dest_cpu < nr_cpu_ids) - goto move; - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } - -move: - /* It can have affinity changed while we were choosing. */ - if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) - goto again; -} - -/* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. 
So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); - unsigned long flags; - - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); -} - -/* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. - */ -void sched_idle_next(void) -{ - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - update_rq_clock(rq); - activate_task(rq, p, 0); - - spin_unlock_irqrestore(&rq->lock, flags); -} - -/* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. - */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) -{ - struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); - - /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. - */ - spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. 
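A hedged aside: this offline path (migrate_live_tasks(), sched_idle_next(), migrate_dead_tasks()) can be driven from userspace through sysfs; root is required and cpu1 is only an example:

#include <stdio.h>

/* cpu_set_online() is our helper name; the path layout is the standard
 * sysfs CPU-hotplug interface. */
static int cpu_set_online(int cpu, int online)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", cpu);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%d\n", online);
        return fclose(f);
}

int main(void)
{
        if (cpu_set_online(1, 0) != 0) {        /* take cpu1 down ... */
                perror("offline cpu1");
                return 1;
        }
        cpu_set_online(1, 1);                   /* ... and bring it back up */
        return 0;
}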
*/ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; - - for ( ; ; ) { - if (!rq->nr_running) - break; - update_rq_clock(rq); - next = pick_next_task(rq, rq->curr); - if (!next) - break; - next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); - - } -} -#endif /* CONFIG_HOTPLUG_CPU */ - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {0, }, -}; - -static struct ctl_table sd_ctl_root[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {0, }, -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - mode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(13); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[12] is terminator */ - - return table; -} - -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, 
"domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_online_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_online_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif - -static void set_rq_online(struct rq *rq) -{ - if (!rq->online) { - const struct sched_class *class; - - cpumask_set_cpu(rq->cpu, rq->rd->online); - rq->online = 1; - - for_each_class(class) { - if (class->rq_online) - class->rq_online(rq); - } - } -} - -static void set_rq_offline(struct rq *rq) -{ - if (rq->online) { - const struct sched_class *class; - - for_each_class(class) { - if (class->rq_offline) - class->rq_offline(rq); - } - - cpumask_clear_cpu(rq->cpu, rq->rd->online); - rq->online = 0; - } -} - -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. - */ -static int __cpuinit -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - struct task_struct *p; - int cpu = (long)hcpu; - unsigned long flags; - struct rq *rq; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); - if (IS_ERR(p)) - return NOTIFY_BAD; - kthread_bind(p, cpu); - /* Must be high prio: stop_machine expects to yield to it. */ - rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rq, &flags); - cpu_rq(cpu)->migration_thread = p; - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - /* Strictly unnecessary, as first user will wake it. */ - wake_up_process(cpu_rq(cpu)->migration_thread); - - /* Update our root-domain */ - rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!cpu_rq(cpu)->migration_thread) - break; - /* Unbind it from offline cpu so it can run. Fall thru. 
*/ - kthread_bind(cpu_rq(cpu)->migration_thread, - cpumask_any(cpu_online_mask)); - kthread_stop(cpu_rq(cpu)->migration_thread); - cpu_rq(cpu)->migration_thread = NULL; - break; - - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ - migrate_live_tasks(cpu); - rq = cpu_rq(cpu); - kthread_stop(rq->migration_thread); - rq->migration_thread = NULL; - /* Idle task back to normal (off runqueue, low prio) */ - spin_lock_irq(&rq->lock); - update_rq_clock(rq); - deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - spin_unlock_irq(&rq->lock); - cpuset_unlock(); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - - /* - * No need to migrate the tasks: it was best-effort if - * they didn't take sched_hotcpu_mutex. Just wake up - * the requestors. - */ - spin_lock_irq(&rq->lock); - while (!list_empty(&rq->migration_queue)) { - struct migration_req *req; - - req = list_entry(rq->migration_queue.next, - struct migration_req, list); - list_del_init(&req->list); - spin_unlock_irq(&rq->lock); - complete(&req->done); - spin_lock_irq(&rq->lock); - } - spin_unlock_irq(&rq->lock); - break; - - case CPU_DYING: - case CPU_DYING_FROZEN: - /* Update our root-domain */ - rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - spin_unlock_irqrestore(&rq->lock, flags); - break; -#endif - } - return NOTIFY_OK; -} - -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. - */ -static struct notifier_block __cpuinitdata migration_notifier = { - .notifier_call = migration_call, - .priority = 10 -}; - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Start one for the boot CPU: */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - return err; -} -early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP - -#ifdef CONFIG_SCHED_DEBUG - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - struct sched_group *group = sd->groups; - char str[256]; - - cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %s level %s\n", str, sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); - } - - printk(KERN_DEBUG "%*s groups:", level + 1, ""); - do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } - - if (!group->__cpu_power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); - break; - } - - if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); - 
break; - } - - if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - break; - } - - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - - cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s", str); - - group = group->next; - } while (group != sd->groups); - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - cpumask_var_t groupmask; - int level = 0; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); - return; - } - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, groupmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } - free_cpumask_var(groupmask); -} -#else /* !CONFIG_SCHED_DEBUG */ -# define sched_domain_debug(sd, cpu) do { } while (0) -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { - if (sd->groups != sd->groups->next) - return 0; - } - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) - return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - /* Does parent contain flags not in child? 
*/ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; - /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct root_domain *rd) -{ - cpupri_cleanup(&rd->cpupri); - - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * If we dont want to free the old_rt yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) - set_rq_online(rq); - - spin_unlock_irqrestore(&rq->lock, flags); - - if (old_rd) - free_rootdomain(old_rd); -} - -static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) -{ - memset(rd, 0, sizeof(*rd)); - - if (bootmem) { - alloc_bootmem_cpumask_var(&def_root_domain.span); - alloc_bootmem_cpumask_var(&def_root_domain.online); - alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); - cpupri_init(&rd->cpupri, true); - return 0; - } - - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_online; - - if (cpupri_init(&rd->cpupri, false) != 0) - goto free_rto_mask; - return 0; - -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain, true); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd, false) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. 
*/ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - sd = sd->parent; - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - rcu_assign_pointer(rq->sd, sd); -} - -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - -/* Setup the mask of cpus configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - cpulist_parse(str, cpu_isolated_map); - return 1; -} - -__setup("isolcpus=", isolated_cpu_setup); - -/* - * init_sched_build_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. - */ -static void -init_sched_build_groups(const struct cpumask *span, - const struct cpumask *cpu_map, - int (*group_fn)(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *tmpmask), - struct cpumask *covered, struct cpumask *tmpmask) -{ - struct sched_group *first = NULL, *last = NULL; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg, tmpmask); - int j; - - if (cpumask_test_cpu(i, covered)) - continue; - - cpumask_clear(sched_group_cpus(sg)); - sg->__cpu_power = 0; - - for_each_cpu(j, span) { - if (group_fn(j, cpu_map, NULL, tmpmask) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - -#define SD_NODES_PER_DOMAIN 16 - -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ - int i, n, val, min_val, best_node = 0; - - min_val = INT_MAX; - - for (i = 0; i < nr_node_ids; i++) { - /* Start at @node */ - n = (node + i) % nr_node_ids; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (node_isset(n, *used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - node_set(best_node, *used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. 
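A hedged standalone mirror of the greedy selection in find_next_best_node() above, with a made-up 4-node distance table standing in for node_distance():

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NODES 4
static const int dist[NODES][NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

static int next_best_node(int node, bool used[NODES])
{
        int best = -1, min = INT_MAX;

        for (int n = 0; n < NODES; n++) {
                if (used[n])
                        continue;               /* skip nodes already in the span */
                if (dist[node][n] < min) {
                        min = dist[node][n];
                        best = n;
                }
        }
        if (best >= 0)
                used[best] = true;
        return best;
}

int main(void)
{
        bool used[NODES] = { [0] = true };      /* span starts at node 0 */

        for (int i = 1; i < NODES; i++)
                printf("next node: %d\n", next_best_node(0, used));
        return 0;
}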
- */ -static void sched_domain_node_span(int node, struct cpumask *span) -{ - nodemask_t used_nodes; - int i; - - cpumask_clear(span); - nodes_clear(used_nodes); - - cpumask_or(span, span, cpumask_of_node(node)); - node_set(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, &used_nodes); - - cpumask_or(span, span, cpumask_of_node(next_node)); - } -} -#endif /* CONFIG_NUMA */ - -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; - -/* - * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. - */ -struct static_sched_group { - struct sched_group sg; - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); -}; - -struct static_sched_domain { - struct sched_domain sd; - DECLARE_BITMAP(span, CONFIG_NR_CPUS); -}; - -/* - * SMT sched-domains: - */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); - -static int -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) -{ - if (sg) - *sg = &per_cpu(sched_group_cpus, cpu).sg; - return cpu; -} -#endif /* CONFIG_SCHED_SMT */ - -/* - * multi-core sched-domains: - */ -#ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct static_sched_domain, core_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ - -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; - - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); - group = cpumask_first(mask); - if (sg) - *sg = &per_cpu(sched_group_core, group).sg; - return group; -} -#elif defined(CONFIG_SCHED_MC) -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) -{ - if (sg) - *sg = &per_cpu(sched_group_core, cpu).sg; - return cpu; -} -#endif - -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); - -static int -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_MC - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif - if (sg) - *sg = &per_cpu(sched_group_phys, group).sg; - return group; -} - -#ifdef CONFIG_NUMA -/* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. 
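A hedged userspace aside: the SMT and core sibling masks that the cpu_to_cpu_group()/cpu_to_core_group() helpers above are built from are also exported through sysfs, for example:

#include <stdio.h>

static void show(const char *path)
{
        char buf[256];
        FILE *f = fopen(path, "r");

        if (f && fgets(buf, sizeof(buf), f))
                printf("%s: %s", path, buf);
        if (f)
                fclose(f);
}

int main(void)
{
        show("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list");
        show("/sys/devices/system/cpu/cpu0/topology/core_siblings_list");
        return 0;
}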
- */ -static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static struct sched_group ***sched_group_nodes_bycpu; - -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); - -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; - - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); - - if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; - return group; -} - -static void init_numa_sched_groups_power(struct sched_group *group_head) -{ - struct sched_group *sg = group_head; - int j; - - if (!sg) - return; - do { - for_each_cpu(j, sched_group_cpus(sg)) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg_inc_cpu_power(sg, sd->groups->__cpu_power); - } - sg = sg->next; - } while (sg != group_head); -} -#endif /* CONFIG_NUMA */ - -#ifdef CONFIG_NUMA -/* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ - int cpu, i; - - for_each_cpu(cpu, cpu_map) { - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (!sched_group_nodes) - continue; - - for (i = 0; i < nr_node_ids; i++) { - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -} -#else /* !CONFIG_NUMA */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ -} -#endif /* CONFIG_NUMA */ - -/* - * Initialize sched groups cpu_power. - * - * cpu_power indicates the capacity of sched group, which is used while - * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain. - */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) -{ - struct sched_domain *child; - struct sched_group *group; - - WARN_ON(!sd || !sd->groups); - - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) - return; - - child = sd->child; - - sd->groups->__cpu_power = 0; - - /* - * For perf policy, if the groups in child domain share resources - * (for example cores sharing some portions of the cache hierarchy - * or SMT), then set this domain groups cpu_power such that each group - * can handle only one task, when there are other idle groups in the - * same sched domain. 
- */ - if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && - (child->flags & - (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); - return; - } - - /* - * add cpu_power of each child group to this groups cpu_power - */ - group = child->groups; - do { - sg_inc_cpu_power(sd->groups, group->__cpu_power); - group = group->next; - } while (group != child->groups); -} - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT(sd, type) sd_init_##type(sd) - -#define SD_INIT_FUNC(type) \ -static noinline void sd_init_##type(struct sched_domain *sd) \ -{ \ - memset(sd, 0, sizeof(*sd)); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif - -static int default_relax_domain_level = -1; - -static int __init setup_relax_domain_level(char *str) -{ - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) - default_relax_domain_level = val; - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); - } -} - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int __build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) -{ - int i, err = -ENOMEM; - struct root_domain *rd; - cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, - tmpmask; -#ifdef CONFIG_NUMA - cpumask_var_t domainspan, covered, notcovered; - struct sched_group **sched_group_nodes = NULL; - int sd_allnodes = 0; - - if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&covered, GFP_KERNEL)) - goto free_domainspan; - if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) - goto free_covered; -#endif - - if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) - goto free_notcovered; - if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) - goto free_nodemask; - if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) - goto free_this_sibling_map; - if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) - goto free_this_core_map; - if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) - goto free_send_covered; - -#ifdef CONFIG_NUMA - /* - * Allocate the per-node list of sched groups - */ - sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - goto free_tmpmask; - } -#endif - - rd = alloc_rootdomain(); - if (!rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - goto free_sched_groups; - } - -#ifdef CONFIG_NUMA - 
sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; -#endif - - /* - * Set up domains for cpus specified by the cpu_map. - */ - for_each_cpu(i, cpu_map) { - struct sched_domain *sd = NULL, *p; - - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); - -#ifdef CONFIG_NUMA - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); - p = sd; - sd_allnodes = 1; - } else - p = NULL; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = p; - if (p) - p->child = sd; - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); -#endif - - p = sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), nodemask); - sd->parent = p; - if (p) - p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); - -#ifdef CONFIG_SCHED_MC - p = sd; - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, - cpu_coregroup_mask(i)); - sd->parent = p; - p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); -#endif - -#ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - &per_cpu(cpu_sibling_map, i), cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); -#endif - } - -#ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(this_sibling_map, - &per_cpu(cpu_sibling_map, i), cpu_map); - if (i != cpumask_first(this_sibling_map)) - continue; - - init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group, - send_covered, tmpmask); - } -#endif - -#ifdef CONFIG_SCHED_MC - /* Set up multi-core groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(this_core_map)) - continue; - - init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group, - send_covered, tmpmask); - } -#endif - - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) { - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) - continue; - - init_sched_build_groups(nodemask, cpu_map, - &cpu_to_phys_group, - send_covered, tmpmask); - } - -#ifdef CONFIG_NUMA - /* Set up node groups */ - if (sd_allnodes) { - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - send_covered, tmpmask); - } - - for (i = 0; i < nr_node_ids; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - int j; - - cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) { - sched_group_nodes[i] = NULL; - continue; - } - - sched_domain_node_span(i, domainspan); - cpumask_and(domainspan, domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for " - "node %d\n", i); - goto error; - } - sched_group_nodes[i] = sg; - for_each_cpu(j, nodemask) { - struct sched_domain *sd; - - sd = &per_cpu(node_domains, j).sd; - 
sd->groups = sg; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), nodemask); - sg->next = sg; - cpumask_or(covered, covered, nodemask); - prev = sg; - - for (j = 0; j < nr_node_ids; j++) { - int n = (i + j) % nr_node_ids; - - cpumask_complement(notcovered, covered); - cpumask_and(tmpmask, notcovered, cpu_map); - cpumask_and(tmpmask, tmpmask, domainspan); - if (cpumask_empty(tmpmask)) - break; - - cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); - if (cpumask_empty(tmpmask)) - continue; - - sg = kmalloc_node(sizeof(struct sched_group) + - cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - goto error; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), tmpmask); - sg->next = prev->next; - cpumask_or(covered, covered, tmpmask); - prev->next = sg; - prev = sg; - } - } -#endif - - /* Calculate CPU power for physical packages and nodes */ -#ifdef CONFIG_SCHED_SMT - for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; - - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_MC - for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i).sd; - - init_sched_groups_power(i, sd); - } -#endif - - for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i).sd; - - init_sched_groups_power(i, sd); - } - -#ifdef CONFIG_NUMA - for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(sched_group_nodes[i]); - - if (sd_allnodes) { - struct sched_group *sg; - - cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - tmpmask); - init_numa_sched_groups_power(sg); - } -#endif - - /* Attach the domains */ - for_each_cpu(i, cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; -#elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i).sd; -#else - sd = &per_cpu(phys_domains, i).sd; -#endif - cpu_attach_domain(sd, rd, i); - } - - err = 0; - -free_tmpmask: - free_cpumask_var(tmpmask); -free_send_covered: - free_cpumask_var(send_covered); -free_this_core_map: - free_cpumask_var(this_core_map); -free_this_sibling_map: - free_cpumask_var(this_sibling_map); -free_nodemask: - free_cpumask_var(nodemask); -free_notcovered: -#ifdef CONFIG_NUMA - free_cpumask_var(notcovered); -free_covered: - free_cpumask_var(covered); -free_domainspan: - free_cpumask_var(domainspan); -out: -#endif - return err; - -free_sched_groups: -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - goto free_tmpmask; - -#ifdef CONFIG_NUMA -error: - free_sched_groups(cpu_map, tmpmask); - free_rootdomain(rd); - goto free_tmpmask; -#endif -} - -static int build_sched_domains(const struct cpumask *cpu_map) -{ - return __build_sched_domains(cpu_map, NULL); -} - -static struct cpumask *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom domains in 'doms_cur' */ - -/* - * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. - */ -static cpumask_var_t fallback_doms; - -/* - * arch_update_cpu_topology lets virtualized architectures update the - * cpu core maps. It is supposed to return 1 if the topology changed - * or 0 if it stayed the same. - */ -int __attribute__((weak)) arch_update_cpu_topology(void) -{ - return 0; -} - -/* - * Set up scheduler domains and groups. 
Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ -static int arch_init_sched_domains(const struct cpumask *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); - if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); - dattr_cur = NULL; - err = build_sched_domains(doms_cur); - register_sched_domain_sysctl(); - - return err; -} - -static void arch_destroy_sched_domains(const struct cpumask *cpu_map, - struct cpumask *tmpmask) -{ - free_sched_groups(cpu_map, tmpmask); -} - -/* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain - */ -static void detach_destroy_domains(const struct cpumask *cpu_map) -{ - /* Save because hotplug lock held. */ - static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); - int i; - - for_each_cpu(i, cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); - arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* fast path */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? (new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. - * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_mask. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - int new_topology; - - mutex_lock(&sched_domains_mutex); - - /* always unregister in case we don't destroy any domains */ - unregister_sched_domain_sysctl(); - - /* Let architecture update cpu core mappings. */ - new_topology = arch_update_cpu_topology(); - - n = doms_new ? 
ndoms_new : 0; - - /* Destroy deleted domains */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); -match1: - ; - } - - if (doms_new == NULL) { - ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); - } - - /* Build new domains */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, - dattr_new ? dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void arch_reinit_sched_domains(void) -{ - get_online_cpus(); - - /* Destroy domains first to force the rebuild */ - partition_sched_domains(0, NULL, NULL); - - rebuild_sched_domains(); - put_online_cpus(); -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ - unsigned int level = 0; - - if (sscanf(buf, "%u", &level) != 1) - return -EINVAL; - - /* - * level is always be positive so don't check for - * level < POWERSAVINGS_BALANCE_NONE which is 0 - * What happens on 0 or 1 byte write, - * need to check for count as well? - */ - - if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) - return -EINVAL; - - if (smt) - sched_smt_power_savings = level; - else - sched_mc_power_savings = level; - - arch_reinit_sched_domains(); - - return count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, - char *page) -{ - return sprintf(page, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 0); -} -static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, - char *page) -{ - return sprintf(page, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 1); -} -static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, - sched_smt_power_savings_show, - sched_smt_power_savings_store); -#endif - -int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); -#endif - return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - -#ifndef CONFIG_CPUSETS -/* - * Add online and remove offline CPUs from the scheduler domains. 
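/*
 * Aside: the reconciliation flow of partition_sched_domains() above,
 * reduced to plain integers standing in for cpumask partitions and with
 * attributes and topology changes ignored.  repartition(), destroy() and
 * build() are made-up names; the point is only the two passes: destroy
 * domains that disappeared, then build domains that are genuinely new.
 */
#include <stdio.h>

static void destroy(int dom) { printf("destroy domain %d\n", dom); }
static void build(int dom)   { printf("build domain %d\n", dom); }

static void repartition(const int *cur, int ncur, const int *new, int nnew)
{
        int i, j;

        for (i = 0; i < ncur; i++) {            /* destroy deleted domains */
                for (j = 0; j < nnew; j++)
                        if (cur[i] == new[j])
                                goto keep_old;
                destroy(cur[i]);
keep_old:       ;
        }
        for (i = 0; i < nnew; i++) {            /* build brand-new domains */
                for (j = 0; j < ncur; j++)
                        if (new[i] == cur[j])
                                goto keep_new;
                build(new[i]);
keep_new:       ;
        }
}

int main(void)
{
        int cur[] = { 1, 2, 3 }, new[] = { 2, 3, 4 };

        repartition(cur, 3, new, 3);            /* destroys 1, builds 4 */
        return 0;
}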
- * When cpusets are enabled they take over this function. - */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - partition_sched_domains(1, NULL, NULL); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} -#endif - -static int update_runtime(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -void __init sched_init_smp(void) -{ - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif - get_online_cpus(); - mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_domains_mutex); - put_online_cpus(); - -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif - - /* RT runtime code needs to handle some hotplug events */ - hotcpu_notifier(update_runtime, 0); - - init_hrtick(); - - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) - BUG(); - sched_init_granularity(); - free_cpumask_var(non_isolated_cpus); - - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - init_sched_rt_class(); -} -#else -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* CONFIG_SMP */ - -int in_sched_functions(unsigned long addr) -{ - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; -#endif - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -} - -static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - rt_rq->highest_prio = MAX_RT_PRIO; -#endif -#ifdef CONFIG_SMP - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; -#endif - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, - struct sched_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = tg; - if (add) - 
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - - tg->se[cpu] = se; - /* se could be NULL for init_task_group */ - if (!se) - return; - - if (!parent) - se->cfs_rq = &rq->cfs; - else - se->cfs_rq = parent->my_q; - - se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; - se->parent = parent; -} -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, - struct sched_rt_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); - rt_rq->tg = tg; - rt_rq->rt_se = rt_se; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); - - tg->rt_se[cpu] = rt_se; - if (!rt_se) - return; - - if (!parent) - rt_se->rt_rq = &rq->rt; - else - rt_se->rt_rq = parent->my_q; - - rt_se->my_q = rt_rq; - rt_se->parent = parent; - INIT_LIST_HEAD(&rt_se->run_list); -} -#endif - -void __init sched_init(void) -{ - int i, j; - unsigned long alloc_size = 0, ptr; - -#ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_USER_SCHED - alloc_size *= 2; -#endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ - if (alloc_size) { - ptr = (unsigned long)alloc_bootmem(alloc_size); - -#ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - init_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - -#ifdef CONFIG_USER_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - init_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - -#ifdef CONFIG_USER_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - } - -#ifdef CONFIG_SMP - init_defrootdomain(); -#endif - - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - -#ifdef CONFIG_RT_GROUP_SCHED - init_rt_bandwidth(&init_task_group.rt_bandwidth, - global_rt_period(), global_rt_runtime()); -#ifdef CONFIG_USER_SCHED - init_rt_bandwidth(&root_task_group.rt_bandwidth, - global_rt_period(), RUNTIME_INF); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_GROUP_SCHED - list_add(&init_task_group.list, &task_groups); - INIT_LIST_HEAD(&init_task_group.children); - -#ifdef CONFIG_USER_SCHED - INIT_LIST_HEAD(&root_task_group.children); - init_task_group.parent = &root_task_group; - list_add(&init_task_group.siblings, &root_task_group.children); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_GROUP_SCHED */ - - for_each_possible_cpu(i) { - struct rq *rq; - - rq = cpu_rq(i); - spin_lock_init(&rq->lock); - rq->nr_running = 0; - init_cfs_rq(&rq->cfs, rq); - init_rt_rq(&rq->rt, rq); -#ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.shares = 
init_task_group_load; - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED - /* - * How much cpu bandwidth does init_task_group get? - * - * In case of task-groups formed thr' the cgroup filesystem, it - * gets 100% of the cpu resources in the system. This overall - * system cpu resource is divided among the tasks of - * init_task_group and its child task-groups in a fair manner, - * based on each entity's (task or task-group's) weight - * (se->load.weight). - * - * In other words, if init_task_group has 10 tasks of weight - * 1024) and two child groups A0 and A1 (of weight 1024 each), - * then A0's share of the cpu resource is: - * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% - * - * We achieve this by letting init_task_group's tasks sit - * directly in rq->cfs (i.e init_task_group->se[] = NULL). - */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - root_task_group.shares = NICE_0_LOAD; - init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); - /* - * In case of task-groups formed thr' the user id of tasks, - * init_task_group represents tasks belonging to root user. - * Hence it forms a sibling of all subsequent groups formed. - * In this case, init_task_group gets only a fraction of overall - * system cpu resource, based on the weight assigned to root - * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished - * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_cfs_rq) and having one entity represent this group of - * tasks in rq->cfs (i.e init_task_group->se[] != NULL). - */ - init_tg_cfs_entry(&init_task_group, - &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1, - root_task_group.se[i]); - -#endif -#endif /* CONFIG_FAIR_GROUP_SCHED */ - - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; -#ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); - init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1, - root_task_group.rt_se[i]); -#endif -#endif - - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; -#ifdef CONFIG_SMP - rq->sd = NULL; - rq->rd = NULL; - rq->active_balance = 0; - rq->next_balance = jiffies; - rq->push_cpu = 0; - rq->cpu = i; - rq->online = 0; - rq->migration_thread = NULL; - INIT_LIST_HEAD(&rq->migration_queue); - rq_attach_root(rq, &def_root_domain); -#endif - init_rq_hrtick(rq); - atomic_set(&rq->nr_iowait, 0); - } - - set_load_weight(&init_task); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - -#ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#endif - -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); -#endif - - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current); - - /* - * Make us the idle thread. Technically, schedule() should not be - * called from this thread, however somewhere below it might be, - * but because we are the idle thread, we just pick up running again - * when this runqueue becomes "idle". 
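/*
 * Aside: the share arithmetic from the init_task_group comment above,
 * done as a standalone calculation.  With ten tasks of weight 1024 plus
 * two child groups of weight 1024 each, group A0's slice of the CPU is
 * 1024 / (10*1024 + 1024 + 1024), i.e. one twelfth.
 */
#include <stdio.h>

int main(void)
{
        double nice0 = 1024.0;                          /* NICE_0_LOAD-style weight */
        double total = 10.0 * nice0 + nice0 + nice0;    /* tasks + A0 + A1 */
        double a0    = nice0 / total;

        printf("A0's bandwidth = %.2f%%\n", a0 * 100.0); /* prints 8.33% */
        return 0;
}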
- */ - init_idle(current, smp_processor_id()); - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - - /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_bootmem_cpumask_var(&nohz_cpu_mask); -#ifdef CONFIG_SMP -#ifdef CONFIG_NO_HZ - alloc_bootmem_cpumask_var(&nohz.cpu_mask); -#endif - alloc_bootmem_cpumask_var(&cpu_isolated_map); -#endif /* SMP */ - - scheduler_running = 1; -} - -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) -{ -#ifdef in_atomic - static unsigned long prev_jiffy; /* ratelimiting */ - - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) - return; - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); -#endif -} -EXPORT_SYMBOL(__might_sleep); -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -static void normalize_task(struct rq *rq, struct task_struct *p) -{ - int on_rq; - - update_rq_clock(rq); - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - activate_task(rq, p, 0); - resched_task(rq->curr); - } -} - -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; - - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { - /* - * Only normalize user tasks: - */ - if (!p->mm) - continue; - - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; -#endif - - if (!rt_task(p)) { - /* - * Renice negative nice level userspace - * tasks back to 0: - */ - if (TASK_NICE(p) < 0 && p->mm) - set_user_nice(p, 0); - continue; - } - - spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - - normalize_task(rq, p); - - __task_rq_unlock(rq); - spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); -} - -#endif /* CONFIG_MAGIC_SYSRQ */ - -#ifdef CONFIG_IA64 -/* - * These functions are only useful for the IA64 MCA handling. - * - * They can only be called when the whole system has been - * stopped - every CPU needs to be quiescent, and no scheduling - * activity can take place. Using them for anything else would - * be a serious bug, and as a result, they aren't even visible - * under any other configuration. - */ - -/** - * curr_task - return the current task for a given cpu. - * @cpu: the processor in question. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -struct task_struct *curr_task(int cpu) -{ - return cpu_curr(cpu); -} - -/** - * set_curr_task - set the current task for a given cpu. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. 
This function - * must be called with all CPU's synchronized, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -void set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void free_fair_sched_group(struct task_group *tg) -{ - int i; - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); -} - -static -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se; - struct rq *rq; - int i; - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->se) - goto err; - - tg->shares = NICE_0_LOAD; - - for_each_possible_cpu(i) { - rq = cpu_rq(i); - - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kzalloc_node(sizeof(struct sched_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err; - - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); - } - - return 1; - - err: - return 0; -} - -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); -} -#else /* !CONFG_FAIR_GROUP_SCHED */ -static inline void free_fair_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static void free_rt_sched_group(struct task_group *tg) -{ - int i; - - destroy_rt_bandwidth(&tg->rt_bandwidth); - - for_each_possible_cpu(i) { - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); - } - - kfree(tg->rt_rq); - kfree(tg->rt_se); -} - -static -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; - struct rq *rq; - int i; - - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); - - for_each_possible_cpu(i) { - rq = cpu_rq(i); - - rt_rq = kzalloc_node(sizeof(struct rt_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_rq) - goto err; - - rt_se = kzalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_se) - goto err; - - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); - } - - return 1; - - err: - return 0; -} - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - 
list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} -#else /* !CONFIG_RT_GROUP_SCHED */ -static inline void free_rt_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_GROUP_SCHED -static void free_sched_group(struct task_group *tg) -{ - free_fair_sched_group(tg); - free_rt_sched_group(tg); - kfree(tg); -} - -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(struct task_group *parent) -{ - struct task_group *tg; - unsigned long flags; - int i; - - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - - if (!alloc_fair_sched_group(tg, parent)) - goto err; - - if (!alloc_rt_sched_group(tg, parent)) - goto err; - - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } - list_add_rcu(&tg->list, &task_groups); - - WARN_ON(!parent); /* root should already exist */ - - tg->parent = parent; - INIT_LIST_HEAD(&tg->children); - list_add_rcu(&tg->siblings, &parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); -} - -/* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) -{ - /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); -} - -/* Destroy runqueue etc associated with a task group */ -void sched_destroy_group(struct task_group *tg) -{ - unsigned long flags; - int i; - - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } - list_del_rcu(&tg->list); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); -} - -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. 
- */ -void sched_move_task(struct task_struct *tsk) -{ - int on_rq, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - update_rq_clock(rq); - - running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; - - if (on_rq) - dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - - set_task_rq(tsk, task_cpu(tsk)); - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); -#endif - - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, tsk, 0); - - task_rq_unlock(rq, &flags); -} -#endif /* CONFIG_GROUP_SCHED */ - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - int on_rq; - - on_rq = se->on_rq; - if (on_rq) - dequeue_entity(cfs_rq, se, 0); - - se->load.weight = shares; - se->load.inv_weight = 0; - - if (on_rq) - enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); -} - -static DEFINE_MUTEX(shares_mutex); - -int sched_group_set_shares(struct task_group *tg, unsigned long shares) -{ - int i; - unsigned long flags; - - /* - * We can't change the weight of the root cgroup. - */ - if (!tg->se[0]) - return -EINVAL; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; - - mutex_lock(&shares_mutex); - if (tg->shares == shares) - goto done; - - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ - tg->shares = shares; - for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares); - } - - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); -done: - mutex_unlock(&shares_mutex); - return 0; -} - -unsigned long sched_group_shares(struct task_group *tg) -{ - return tg->shares; -} -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. 
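/*
 * Aside: the fixed-point "runtime over period" ratio that the
 * schedulability check below is built on, mirroring the to_ratio()
 * helper: both sides are scaled by 2^20 so they can be compared without
 * floating point.  The example values (0.95 s of runtime per 1 s period
 * for the global limit, 0.4 s for a group) are just sample numbers.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t ratio(uint64_t period_ns, uint64_t runtime_ns)
{
        return (runtime_ns << 20) / period_ns;  /* runtime/period in 1/2^20 units */
}

int main(void)
{
        uint64_t global = ratio(1000000000ULL, 950000000ULL);
        uint64_t group  = ratio(1000000000ULL, 400000000ULL);

        printf("global=%llu group=%llu fits=%d\n",
               (unsigned long long)global, (unsigned long long)group,
               group <= global);
        return 0;
}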
- */ -static DEFINE_MUTEX(rt_constraints_mutex); - -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} - -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) -{ - struct task_struct *g, *p; - - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - - return 0; -} - -struct rt_schedulable_data { - struct task_group *tg; - u64 rt_period; - u64 rt_runtime; -}; - -static int tg_schedulable(struct task_group *tg, void *data) -{ - struct rt_schedulable_data *d = data; - struct task_group *child; - unsigned long total, sum = 0; - u64 period, runtime; - - period = ktime_to_ns(tg->rt_bandwidth.rt_period); - runtime = tg->rt_bandwidth.rt_runtime; - - if (tg == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - -#ifdef CONFIG_USER_SCHED - if (tg == &root_task_group) { - period = global_rt_period(); - runtime = global_rt_runtime(); - } -#endif - - /* - * Cannot have more runtime than the period. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - /* - * Ensure we don't starve existing RT tasks. - */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) - return -EBUSY; - - total = to_ratio(period, runtime); - - /* - * Nobody can have more than the global setting allows. - */ - if (total > to_ratio(global_rt_period(), global_rt_runtime())) - return -EINVAL; - - /* - * The sum of our children's runtime should not exceed our own. - */ - list_for_each_entry_rcu(child, &tg->children, siblings) { - period = ktime_to_ns(child->rt_bandwidth.rt_period); - runtime = child->rt_bandwidth.rt_runtime; - - if (child == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - sum += to_ratio(period, runtime); - } - - if (sum > total) - return -EINVAL; - - return 0; -} - -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct rt_schedulable_data data = { - .tg = tg, - .rt_period = period, - .rt_runtime = runtime, - }; - - return walk_tg_tree(tg_schedulable, tg_nop, &data); -} - -static int tg_set_bandwidth(struct task_group *tg, - u64 rt_period, u64 rt_runtime) -{ - int i, err = 0; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - err = __rt_schedulable(tg, rt_period, rt_runtime); - if (err) - goto unlock; - - spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); - tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); - tg->rt_bandwidth.rt_runtime = rt_runtime; - - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = tg->rt_rq[i]; - - spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_runtime; - spin_unlock(&rt_rq->rt_runtime_lock); - } - spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return err; -} - -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) -{ - u64 rt_runtime, rt_period; - - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us < 0) - rt_runtime = RUNTIME_INF; - - return tg_set_bandwidth(tg, rt_period, rt_runtime); -} - -long sched_group_rt_runtime(struct task_group *tg) -{ - u64 rt_runtime_us; - - if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) - return -1; - - rt_runtime_us = tg->rt_bandwidth.rt_runtime; - do_div(rt_runtime_us, NSEC_PER_USEC); - return rt_runtime_us; -} - -int 
sched_group_set_rt_period(struct task_group *tg, long rt_period_us) -{ - u64 rt_runtime, rt_period; - - rt_period = (u64)rt_period_us * NSEC_PER_USEC; - rt_runtime = tg->rt_bandwidth.rt_runtime; - - if (rt_period == 0) - return -EINVAL; - - return tg_set_bandwidth(tg, rt_period, rt_runtime); -} - -long sched_group_rt_period(struct task_group *tg) -{ - u64 rt_period_us; - - rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); - do_div(rt_period_us, NSEC_PER_USEC); - return rt_period_us; -} - -static int sched_rt_global_constraints(void) -{ - u64 runtime, period; - int ret = 0; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return ret; -} - -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) -{ - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) - return 0; - - return 1; -} - -#else /* !CONFIG_RT_GROUP_SCHED */ -static int sched_rt_global_constraints(void) -{ - unsigned long flags; - int i; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - spin_unlock(&rt_rq->rt_runtime_lock); - } - spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - - return 0; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - old_period = sysctl_sched_rt_period; - old_runtime = sysctl_sched_rt_runtime; - - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); - - if (!ret && write) { - ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } - } - mutex_unlock(&mutex); - - return ret; -} - -#ifdef CONFIG_CGROUP_SCHED - -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); -} - -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct task_group *tg, *parent; - - if (!cgrp->parent) { - /* This is early initialization for the top cgroup */ - return &init_task_group.css; - } - - parent = cgroup_tg(cgrp->parent); - tg = sched_create_group(parent); - if (IS_ERR(tg)) - return ERR_PTR(-ENOMEM); - - return &tg->css; -} - -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct task_group *tg = cgroup_tg(cgrp); - - sched_destroy_group(tg); -} - -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) -{ -#ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) - return 
-EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; -#endif - - return 0; -} - -static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) -{ - sched_move_task(tsk); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) -{ - return sched_group_set_shares(cgroup_tg(cgrp), shareval); -} - -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) -{ - struct task_group *tg = cgroup_tg(cgrp); - - return (u64) tg->shares; -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) -{ - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); -} - -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) -{ - return sched_group_rt_runtime(cgroup_tg(cgrp)); -} - -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) -{ - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); -} - -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) -{ - return sched_group_rt_period(cgroup_tg(cgrp)); -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", - .read_u64 = cpu_shares_read_u64, - .write_u64 = cpu_shares_write_u64, - }, -#endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, - }, -#endif -}; - -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); -} - -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", - .create = cpu_cgroup_create, - .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, - .populate = cpu_cgroup_populate, - .subsys_id = cpu_cgroup_subsys_id, - .early_init = 1, -}; - -#endif /* CONFIG_CGROUP_SCHED */ - -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). 
- */ - -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 *cpuusage; - struct cpuacct *parent; -}; - -struct cgroup_subsys cpuacct_subsys; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - - if (!ca) - return ERR_PTR(-ENOMEM); - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) { - kfree(ca); - return ERR_PTR(-ENOMEM); - } - - if (cgrp->parent) - ca->parent = cgroup_ca(cgrp->parent); - - return &ca->css; -} - -/* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - -}; - -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); -} - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. 
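/*
 * Aside: the shape of the cpuacct counters described above -- one u64
 * per cpu, charged locally and summed when read.  Plain arrays stand in
 * for the kernel's per-cpu allocation, charge() and usage_read() are
 * made-up names, and locking is omitted on purpose.
 */
#include <stdio.h>
#include <stdint.h>

#define NCPUS 4

static uint64_t cpuusage[NCPUS];                /* nanoseconds charged per cpu */

static void charge(int cpu, uint64_t ns)
{
        cpuusage[cpu] += ns;                    /* hot path: touch one counter */
}

static uint64_t usage_read(void)
{
        uint64_t total = 0;
        int i;

        for (i = 0; i < NCPUS; i++)             /* slow path: sum on demand */
                total += cpuusage[i];
        return total;
}

int main(void)
{
        charge(0, 1000);
        charge(2, 500);
        printf("total usage = %llu ns\n", (unsigned long long)usage_read());
        return 0;
}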
- */ -static void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (!cpuacct_subsys.active) - return; - - cpu = task_cpu(tsk); - ca = task_ca(tsk); - - for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .create = cpuacct_create, - .destroy = cpuacct_destroy, - .populate = cpuacct_populate, - .subsys_id = cpuacct_subsys_id, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ -#endif /* !DDE_LINUX */ diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/sched_cpupri.h.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/sched_cpupri.h.svn-base deleted file mode 100644 index 642a94ef..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/sched_cpupri.h.svn-base +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef _LINUX_CPUPRI_H -#define _LINUX_CPUPRI_H - -#include <linux/sched.h> - -#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) -#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) - -#define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 -/* values 2-101 are RT priorities 0-99 */ - -struct cpupri_vec { - spinlock_t lock; - int count; - cpumask_var_t mask; -}; - -struct cpupri { - struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - long pri_active[CPUPRI_NR_PRI_WORDS]; - int cpu_to_pri[NR_CPUS]; -}; - -#ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, - struct task_struct *p, cpumask_t *lowest_mask); -void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp, bool bootmem); -void cpupri_cleanup(struct cpupri *cp); -#else -#define cpupri_set(cp, cpu, pri) do { } while (0) -#define cpupri_init() do { } while (0) -#endif - -#endif /* _LINUX_CPUPRI_H */ diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/sys.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/sys.c.svn-base deleted file mode 100644 index 6533cb97..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/sys.c.svn-base +++ /dev/null @@ -1,1893 +0,0 @@ -/* - * linux/kernel/sys.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/utsname.h> -#include <linux/mman.h> -#include <linux/smp_lock.h> -#include <linux/notifier.h> -#include <linux/reboot.h> -#include <linux/prctl.h> -#include <linux/highuid.h> -#include <linux/fs.h> -#include <linux/resource.h> -#include <linux/kernel.h> -#include <linux/kexec.h> -#include <linux/workqueue.h> -#include <linux/capability.h> -#include <linux/device.h> -#include <linux/key.h> -#include <linux/times.h> -#include <linux/posix-timers.h> -#include <linux/security.h> -#include <linux/dcookies.h> -#include <linux/suspend.h> -#include <linux/tty.h> -#include <linux/signal.h> -#include <linux/cn_proc.h> -#include <linux/getcpu.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/seccomp.h> -#include <linux/cpu.h> -#include <linux/ptrace.h> - -#include <linux/compat.h> -#include <linux/syscalls.h> -#include <linux/kprobes.h> -#include <linux/user_namespace.h> - -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/unistd.h> - -#ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) -#endif -#ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) -#endif -#ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) -#endif 
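/*
 * Aside: the pattern behind the SET_..._CTL / GET_..._CTL stubs at this
 * point in sys.c -- a generic file supplies a failing default only when
 * the architecture has not already provided its own macro.
 * ARCH_TUNE_WIDGET is a made-up name used purely to show the shape.
 */
#include <stdio.h>
#include <errno.h>

/* an architecture header would define this before this point */
#ifndef ARCH_TUNE_WIDGET
# define ARCH_TUNE_WIDGET(arg) (-EINVAL)        /* default: not supported */
#endif

int main(void)
{
        int ret = ARCH_TUNE_WIDGET(1);

        printf("ARCH_TUNE_WIDGET -> %d (%s)\n", ret,
               ret == -EINVAL ? "generic fallback" : "arch override");
        return 0;
}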
-#ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) -#endif -#ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) -#endif -#ifndef GET_TSC_CTL -# define GET_TSC_CTL(a) (-EINVAL) -#endif -#ifndef SET_TSC_CTL -# define SET_TSC_CTL(a) (-EINVAL) -#endif - -#ifndef DDE_LINUX -/* - * this is where the system-wide overflow UID and GID are defined, for - * architectures that now have 32-bit UID/GID but didn't in the past - */ - -int overflowuid = DEFAULT_OVERFLOWUID; -int overflowgid = DEFAULT_OVERFLOWGID; - -#ifdef CONFIG_UID16 -EXPORT_SYMBOL(overflowuid); -EXPORT_SYMBOL(overflowgid); -#endif - -/* - * the same as above, but for filesystems which can only store a 16-bit - * UID and GID. as such, this is needed on all architectures - */ - -int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; -int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; - -EXPORT_SYMBOL(fs_overflowuid); -EXPORT_SYMBOL(fs_overflowgid); - -/* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -#endif /* DDE_LINUX */ -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - -#ifndef DDE_LINUX -/* - * set the priority of a task - * - the caller must hold the RCU read lock - */ -static int set_one_prio(struct task_struct *p, int niceval, int error) -{ - const struct cred *cred = current_cred(), *pcred = __task_cred(p); - int no_nice; - - if (pcred->uid != cred->euid && - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { - error = -EPERM; - goto out; - } - if (niceval < task_nice(p) && !can_nice(p, niceval)) { - error = -EACCES; - goto out; - } - no_nice = security_task_setnice(p, niceval); - if (no_nice) { - error = no_nice; - goto out; - } - if (error == -ESRCH) - error = 0; - set_user_nice(p, niceval); -out: - return error; -} - -SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) -{ - struct task_struct *g, *p; - struct user_struct *user; - const struct cred *cred = current_cred(); - int error = -EINVAL; - struct pid *pgrp; - - if (which > PRIO_USER || which < PRIO_PROCESS) - goto out; - - /* normalize: avoid signed division (rounding problems) */ - error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; - - read_lock(&tasklist_lock); - switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = (struct user_struct *) cred->user; - if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) - if (__task_cred(p)->uid == who) - error = set_one_prio(p, niceval, error); - while_each_thread(g, p); - if (who != cred->uid) - free_uid(user); /* For find_user() */ - break; - } -out_unlock: - read_unlock(&tasklist_lock); -out: - return error; -} - -/* - * Ugh. To avoid negative return values, "getpriority()" will - * not return the normal nice-value, but a negated value that - * has been offset by 20 (ie it returns 40..1 instead of -20..19) - * to stay compatible. 
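/*
 * Aside: the nice-value encoding described above, from userspace.
 * getpriority() reports 20 - nice, so the kernel range -20..19 comes back
 * as 40..1 and never collides with the syscall's negative error returns.
 */
#include <stdio.h>

int main(void)
{
        int nice;

        for (nice = -20; nice <= 19; nice++)
                printf("nice %3d -> getpriority() value %2d\n", nice, 20 - nice);
        return 0;
}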
- */ -SYSCALL_DEFINE2(getpriority, int, which, int, who) -{ - struct task_struct *g, *p; - struct user_struct *user; - const struct cred *cred = current_cred(); - long niceval, retval = -ESRCH; - struct pid *pgrp; - - if (which > PRIO_USER || which < PRIO_PROCESS) - return -EINVAL; - - read_lock(&tasklist_lock); - switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = (struct user_struct *) cred->user; - if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) - if (__task_cred(p)->uid == who) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } - while_each_thread(g, p); - if (who != cred->uid) - free_uid(user); /* for find_user() */ - break; - } -out_unlock: - read_unlock(&tasklist_lock); - - return retval; -} - -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - device_shutdown(); - sysdev_shutdown(); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. - */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - sysdev_shutdown(); - printk(KERN_EMERG "System halted.\n"); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - disable_nonboot_cpus(); - sysdev_shutdown(); - printk(KERN_EMERG "Power down.\n"); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. 
- * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. - */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - char buffer[256]; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - lock_kernel(); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - unlock_kernel(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - unlock_kernel(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - unlock_kernel(); - return -EFAULT; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - { - int ret; - ret = kernel_kexec(); - unlock_kernel(); - return ret; - } -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - { - int ret = hibernate(); - unlock_kernel(); - return ret; - } -#endif - - default: - unlock_kernel(); - return -EINVAL; - } - unlock_kernel(); - return 0; -} - -static void deferred_cad(struct work_struct *dummy) -{ - kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - -/* - * Unprivileged users may change the real gid to the effective gid - * or vice versa. (BSD-style) - * - * If you set the real gid at all, or set the effective gid to a value not - * equal to the real gid, then the saved gid is set to the new effective gid. - * - * This makes it possible for a setgid program to completely drop its - * privileges, which is often a useful assertion to make when you are doing - * a security audit over a program. - * - * The general idea is that a program which uses just setregid() will be - * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. - * - * SMP: There are not races, the GIDs are checked only by filesystem - * operations (as far as semantic preservation is concerned). 
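The "completely drop its privileges" case described above looks like this from userspace (editor's sketch; the function name is made up). Setting the real gid forces the saved gid to follow the new effective gid, so the original group cannot be reacquired afterwards:

    #include <sys/types.h>
    #include <unistd.h>

    /* For a set-gid program: permanently give up the elevated group. */
    static int drop_group_privs(void)
    {
        gid_t rgid = getgid();               /* the invoking user's real gid */

        /* real = effective = rgid, and the saved gid follows the effective */
        if (setregid(rgid, rgid) != 0)
            return -1;
        return getegid() == rgid ? 0 : -1;   /* paranoid re-check */
    }

Real code would usually also clear the supplementary group list with setgroups() before this step.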
- */ -SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - - retval = -EPERM; - if (rgid != (gid_t) -1) { - if (old->gid == rgid || - old->egid == rgid || - capable(CAP_SETGID)) - new->gid = rgid; - else - goto error; - } - if (egid != (gid_t) -1) { - if (old->gid == egid || - old->egid == egid || - old->sgid == egid || - capable(CAP_SETGID)) - new->egid = egid; - else - goto error; - } - - if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old->gid)) - new->sgid = new->egid; - new->fsgid = new->egid; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -/* - * setgid() is implemented like SysV w/ SAVED_IDS - * - * SMP: Same implicit races as above. - */ -SYSCALL_DEFINE1(setgid, gid_t, gid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - - retval = -EPERM; - if (capable(CAP_SETGID)) - new->gid = new->egid = new->sgid = new->fsgid = gid; - else if (gid == old->gid || gid == old->sgid) - new->egid = new->fsgid = gid; - else - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -/* - * change the user struct in a credentials set to match the new UID - */ -static int set_user(struct cred *new) -{ - struct user_struct *new_user; - - new_user = alloc_uid(current_user_ns(), new->uid); - if (!new_user) - return -EAGAIN; - - if (!task_can_switch_user(new_user, current)) { - free_uid(new_user); - return -EINVAL; - } - - if (atomic_read(&new_user->processes) >= - current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != INIT_USER) { - free_uid(new_user); - return -EAGAIN; - } - - free_uid(new->user); - new->user = new_user; - return 0; -} - -/* - * Unprivileged users may change the real uid to the effective uid - * or vice versa. (BSD-style) - * - * If you set the real uid at all, or set the effective uid to a value not - * equal to the real uid, then the saved uid is set to the new effective uid. - * - * This makes it possible for a setuid program to completely drop its - * privileges, which is often a useful assertion to make when you are doing - * a security audit over a program. - * - * The general idea is that a program which uses just setreuid() will be - * 100% compatible with BSD. A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. 
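The other half of the BSD semantics, temporarily lowering and later regaining the effective uid of a set-uid program, can be sketched as follows (editor's illustration; the names are made up). Because the effective uid is set equal to the real uid, the saved uid keeps the privileged value, which is what permits the second call:

    #include <sys/types.h>
    #include <unistd.h>

    static uid_t privileged_euid;

    static int drop_privs_temporarily(void)
    {
        privileged_euid = geteuid();                  /* e.g. 0 for set-uid root */
        return setreuid((uid_t)-1, getuid());         /* euid := real uid */
    }

    static int regain_privs(void)
    {
        return setreuid((uid_t)-1, privileged_euid);  /* allowed via the saved uid */
    }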
- */ -SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - - retval = -EPERM; - if (ruid != (uid_t) -1) { - new->uid = ruid; - if (old->uid != ruid && - old->euid != ruid && - !capable(CAP_SETUID)) - goto error; - } - - if (euid != (uid_t) -1) { - new->euid = euid; - if (old->uid != euid && - old->euid != euid && - old->suid != euid && - !capable(CAP_SETUID)) - goto error; - } - - if (new->uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } - if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old->uid)) - new->suid = new->euid; - new->fsuid = new->euid; - - retval = security_task_fix_setuid(new, old, LSM_SETID_RE); - if (retval < 0) - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -/* - * setuid() is implemented like SysV with SAVED_IDS - * - * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal - * user and then switch back, because if you're root, setuid() sets - * the saved uid too. If you don't like this, blame the bright people - * in the POSIX committee and/or USG. Note that the BSD-style setreuid() - * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. - */ -SYSCALL_DEFINE1(setuid, uid_t, uid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - - retval = -EPERM; - if (capable(CAP_SETUID)) { - new->suid = new->uid = uid; - if (uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } - } else if (uid != old->uid && uid != new->suid) { - goto error; - } - - new->fsuid = new->euid = uid; - - retval = security_task_fix_setuid(new, old, LSM_SETID_ID); - if (retval < 0) - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - - -/* - * This function implements a generic ability to update ruid, euid, - * and suid. This allows you to implement the 4.4 compatible seteuid(). 
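As the comment says, the 4.4BSD-style seteuid() falls out of this call directly; glibc essentially implements it this way when setresuid() is available (editor's sketch, hypothetical wrapper name):

    #define _GNU_SOURCE
    #include <sys/types.h>
    #include <unistd.h>

    static int my_seteuid(uid_t euid)
    {
        /* (uid_t)-1 means "leave unchanged": only the effective uid moves */
        return setresuid((uid_t)-1, euid, (uid_t)-1);
    }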
- */ -SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - goto error; - old = current_cred(); - - retval = -EPERM; - if (!capable(CAP_SETUID)) { - if (ruid != (uid_t) -1 && ruid != old->uid && - ruid != old->euid && ruid != old->suid) - goto error; - if (euid != (uid_t) -1 && euid != old->uid && - euid != old->euid && euid != old->suid) - goto error; - if (suid != (uid_t) -1 && suid != old->uid && - suid != old->euid && suid != old->suid) - goto error; - } - - if (ruid != (uid_t) -1) { - new->uid = ruid; - if (ruid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } - } - if (euid != (uid_t) -1) - new->euid = euid; - if (suid != (uid_t) -1) - new->suid = suid; - new->fsuid = new->euid; - - retval = security_task_fix_setuid(new, old, LSM_SETID_RES); - if (retval < 0) - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) -{ - const struct cred *cred = current_cred(); - int retval; - - if (!(retval = put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) - retval = put_user(cred->suid, suid); - - return retval; -} - -/* - * Same as above, but for rgid, egid, sgid. - */ -SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - goto error; - - retval = -EPERM; - if (!capable(CAP_SETGID)) { - if (rgid != (gid_t) -1 && rgid != old->gid && - rgid != old->egid && rgid != old->sgid) - goto error; - if (egid != (gid_t) -1 && egid != old->gid && - egid != old->egid && egid != old->sgid) - goto error; - if (sgid != (gid_t) -1 && sgid != old->gid && - sgid != old->egid && sgid != old->sgid) - goto error; - } - - if (rgid != (gid_t) -1) - new->gid = rgid; - if (egid != (gid_t) -1) - new->egid = egid; - if (sgid != (gid_t) -1) - new->sgid = sgid; - new->fsgid = new->egid; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) -{ - const struct cred *cred = current_cred(); - int retval; - - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) - retval = put_user(cred->sgid, sgid); - - return retval; -} - - -/* - * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This - * is used for "access()" and for the NFS daemon (letting nfsd stay at - * whatever uid it wants to). It normally shadows "euid", except when - * explicitly set by setfsuid() or for access.. 
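The NFS-daemon style of use mentioned above, sketched from userspace (editor's illustration; serve_as() is a made-up name). Note the classic quirk that setfsuid() returns the previous fsuid rather than an error code, so the only way to confirm the change is to call it a second time:

    #define _GNU_SOURCE
    #include <sys/fsuid.h>
    #include <sys/types.h>

    static int serve_as(uid_t client_uid)
    {
        uid_t saved = (uid_t)setfsuid(client_uid);       /* returns the old fsuid */

        if ((uid_t)setfsuid(client_uid) != client_uid) {
            /* unprivileged callers may only pick one of ruid/euid/suid/fsuid */
            setfsuid(saved);
            return -1;
        }
        /* ... do filesystem work permission-checked against client_uid ... */
        setfsuid(saved);                                 /* restore our own fsuid */
        return 0;
    }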
- */ -SYSCALL_DEFINE1(setfsuid, uid_t, uid) -{ - const struct cred *old; - struct cred *new; - uid_t old_fsuid; - - new = prepare_creds(); - if (!new) - return current_fsuid(); - old = current_cred(); - old_fsuid = old->fsuid; - - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) - goto error; - - if (uid == old->uid || uid == old->euid || - uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { - if (uid != old_fsuid) { - new->fsuid = uid; - if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) - goto change_okay; - } - } - -error: - abort_creds(new); - return old_fsuid; - -change_okay: - commit_creds(new); - return old_fsuid; -} - -/* - * Samma på svenska.. - */ -SYSCALL_DEFINE1(setfsgid, gid_t, gid) -{ - const struct cred *old; - struct cred *new; - gid_t old_fsgid; - - new = prepare_creds(); - if (!new) - return current_fsgid(); - old = current_cred(); - old_fsgid = old->fsgid; - - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - goto error; - - if (gid == old->gid || gid == old->egid || - gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { - if (gid != old_fsgid) { - new->fsgid = gid; - goto change_okay; - } - } - -error: - abort_creds(new); - return old_fsgid; - -change_okay: - commit_creds(new); - return old_fsgid; -} - -void do_sys_times(struct tms *tms) -{ - struct task_cputime cputime; - cputime_t cutime, cstime; - - thread_group_cputime(current, &cputime); - spin_lock_irq(¤t->sighand->siglock); - cutime = current->signal->cutime; - cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); - tms->tms_utime = cputime_to_clock_t(cputime.utime); - tms->tms_stime = cputime_to_clock_t(cputime.stime); - tms->tms_cutime = cputime_to_clock_t(cutime); - tms->tms_cstime = cputime_to_clock_t(cstime); -} - -SYSCALL_DEFINE1(times, struct tms __user *, tbuf) -{ - if (tbuf) { - struct tms tmp; - - do_sys_times(&tmp); - if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) - return -EFAULT; - } - force_successful_syscall_return(); - return (long) jiffies_64_to_clock_t(get_jiffies_64()); -} - -/* - * This needs some heavy checking ... - * I just haven't the stomach for it. I also don't fully - * understand sessions/pgrp etc. Let somebody who does explain it. - * - * OK, I think I have the protection semantics right.... this is really - * only important on a multi-user system anyway, to make sure one user - * can't send a signal to a process owned by another. -TYT, 12/12/91 - * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 - */ -SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) -{ - struct task_struct *p; - struct task_struct *group_leader = current->group_leader; - struct pid *pgrp; - int err; - - if (!pid) - pid = task_pid_vnr(group_leader); - if (!pgid) - pgid = pid; - if (pgid < 0) - return -EINVAL; - - /* From this point forward we keep holding onto the tasklist lock - * so that our parent does not change from under us. 
-DaveM - */ - write_lock_irq(&tasklist_lock); - - err = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - - err = -EINVAL; - if (!thread_group_leader(p)) - goto out; - - if (same_thread_group(p->real_parent, group_leader)) { - err = -EPERM; - if (task_session(p) != task_session(group_leader)) - goto out; - err = -EACCES; - if (p->did_exec) - goto out; - } else { - err = -ESRCH; - if (p != group_leader) - goto out; - } - - err = -EPERM; - if (p->signal->leader) - goto out; - - pgrp = task_pid(p); - if (pgid != pid) { - struct task_struct *g; - - pgrp = find_vpid(pgid); - g = pid_task(pgrp, PIDTYPE_PGID); - if (!g || task_session(g) != task_session(group_leader)) - goto out; - } - - err = security_task_setpgid(p, pgid); - if (err) - goto out; - - if (task_pgrp(p) != pgrp) { - change_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } - - err = 0; -out: - /* All paths lead to here, thus we are safe. -DaveM */ - write_unlock_irq(&tasklist_lock); - return err; -} - -SYSCALL_DEFINE1(getpgid, pid_t, pid) -{ - struct task_struct *p; - struct pid *grp; - int retval; - - rcu_read_lock(); - if (!pid) - grp = task_pgrp(current); - else { - retval = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - grp = task_pgrp(p); - if (!grp) - goto out; - - retval = security_task_getpgid(p); - if (retval) - goto out; - } - retval = pid_vnr(grp); -out: - rcu_read_unlock(); - return retval; -} - -#ifdef __ARCH_WANT_SYS_GETPGRP - -SYSCALL_DEFINE0(getpgrp) -{ - return sys_getpgid(0); -} - -#endif - -SYSCALL_DEFINE1(getsid, pid_t, pid) -{ - struct task_struct *p; - struct pid *sid; - int retval; - - rcu_read_lock(); - if (!pid) - sid = task_session(current); - else { - retval = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - sid = task_session(p); - if (!sid) - goto out; - - retval = security_task_getsid(p); - if (retval) - goto out; - } - retval = pid_vnr(sid); -out: - rcu_read_unlock(); - return retval; -} - -SYSCALL_DEFINE0(setsid) -{ - struct task_struct *group_leader = current->group_leader; - struct pid *sid = task_pid(group_leader); - pid_t session = pid_vnr(sid); - int err = -EPERM; - - write_lock_irq(&tasklist_lock); - /* Fail if I am already a session leader */ - if (group_leader->signal->leader) - goto out; - - /* Fail if a process group id already exists that equals the - * proposed session id. - */ - if (pid_task(sid, PIDTYPE_PGID)) - goto out; - - group_leader->signal->leader = 1; - __set_special_pids(sid); - - proc_clear_tty(group_leader); - - err = session; -out: - write_unlock_irq(&tasklist_lock); - return err; -} - -/* - * Supplementary group IDs - */ - -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - -struct group_info *groups_alloc(int gidsetsize) -{ - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? 
: 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) - return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - gid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; - -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; -} - -EXPORT_SYMBOL(groups_alloc); - -void groups_free(struct group_info *group_info) -{ - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); -} - -EXPORT_SYMBOL(groups_free); - -/* export the group_info to a user-space array */ -static int groups_to_user(gid_t __user *grouplist, - const struct group_info *group_info) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_to_user(grouplist, group_info->blocks[i], len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} - -/* fill a group_info from a user-space array - it must be allocated already */ -static int groups_from_user(struct group_info *group_info, - gid_t __user *grouplist) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_from_user(group_info->blocks[i], grouplist, len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} - -/* a simple Shell sort */ -static void groups_sort(struct group_info *group_info) -{ - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - gid_t tmp = GROUP_AT(group_info, right); - - while (left >= 0 && GROUP_AT(group_info, left) > tmp) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); - right = left; - left -= stride; - } - GROUP_AT(group_info, right) = tmp; - } - stride /= 3; - } -} - -/* a simple bsearch */ -int groups_search(const struct group_info *group_info, gid_t grp) -{ - unsigned int left, right; - - if (!group_info) - return 0; - - left = 0; - right = group_info->ngroups; - while (left < right) { - unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) - left = mid + 1; - else if (cmp < 0) - right = mid; - else - return 1; - } - return 0; -} - -/** - * set_groups - Change a group subscription in a set of credentials - * @new: The newly prepared set of credentials to alter - * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. 
- */ -int set_groups(struct cred *new, struct group_info *group_info) -{ - int retval; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - - put_group_info(new->group_info); - groups_sort(group_info); - get_group_info(group_info); - new->group_info = group_info; - return 0; -} - -EXPORT_SYMBOL(set_groups); - -/** - * set_current_groups - Change current's group subscription - * @group_info: The group list to impose - * - * Validate a group subscription and, if valid, impose it upon current's task - * security record. - */ -int set_current_groups(struct group_info *group_info) -{ - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); -} - -EXPORT_SYMBOL(set_current_groups); - -SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - const struct cred *cred = current_cred(); - int i; - - if (gidsetsize < 0) - return -EINVAL; - - /* no need to grab task_lock here; it cannot change */ - i = cred->group_info->ngroups; - if (gidsetsize) { - if (i > gidsetsize) { - i = -EINVAL; - goto out; - } - if (groups_to_user(grouplist, cred->group_info)) { - i = -EFAULT; - goto out; - } - } -out: - return i; -} - -/* - * SMP: Our groups are copy-on-write. We can set them safely - * without another task interfering. - */ - -SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - struct group_info *group_info; - int retval; - - if (!capable(CAP_SETGID)) - return -EPERM; - if ((unsigned)gidsetsize > NGROUPS_MAX) - return -EINVAL; - - group_info = groups_alloc(gidsetsize); - if (!group_info) - return -ENOMEM; - retval = groups_from_user(group_info, grouplist); - if (retval) { - put_group_info(group_info); - return retval; - } - - retval = set_current_groups(group_info); - put_group_info(group_info); - - return retval; -} - -/* - * Check whether we're fsgid/egid or in the supplemental group.. 
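For comparison, the closest userspace can get to the check above is against the effective gid plus the getgroups() list (the fsgid consulted by in_group_p() is not directly visible from userspace); editor's sketch, in_my_groups() is a made-up name:

    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int in_my_groups(gid_t grp)
    {
        gid_t *list;
        int i, n;

        if (grp == getegid())
            return 1;
        n = getgroups(0, NULL);                  /* query the list size */
        if (n <= 0)
            return 0;
        list = calloc((size_t)n, sizeof(*list));
        if (!list)
            return 0;
        n = getgroups(n, list);
        for (i = 0; i < n; i++) {
            if (list[i] == grp) {
                free(list);
                return 1;
            }
        }
        free(list);
        return 0;
    }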
- */ -int in_group_p(gid_t grp) -{ - const struct cred *cred = current_cred(); - int retval = 1; - - if (grp != cred->fsgid) - retval = groups_search(cred->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_group_p); - -int in_egroup_p(gid_t grp) -{ - const struct cred *cred = current_cred(); - int retval = 1; - - if (grp != cred->egid) - retval = groups_search(cred->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_egroup_p); - -DECLARE_RWSEM(uts_sem); - -SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) -{ - int errno = 0; - - down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; - up_read(&uts_sem); - return errno; -} - -SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) -{ - int errno; - char tmp[__NEW_UTS_LEN]; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); - - memcpy(u->nodename, tmp, len); - memset(u->nodename + len, 0, sizeof(u->nodename) - len); - errno = 0; - } - up_write(&uts_sem); - return errno; -} - -#ifdef __ARCH_WANT_SYS_GETHOSTNAME - -SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) -{ - int i, errno; - struct new_utsname *u; - - if (len < 0) - return -EINVAL; - down_read(&uts_sem); - u = utsname(); - i = 1 + strlen(u->nodename); - if (i > len) - i = len; - errno = 0; - if (copy_to_user(name, u->nodename, i)) - errno = -EFAULT; - up_read(&uts_sem); - return errno; -} - -#endif - -/* - * Only setdomainname; getdomainname can be implemented by calling - * uname() - */ -SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) -{ - int errno; - char tmp[__NEW_UTS_LEN]; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; - - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); - - memcpy(u->domainname, tmp, len); - memset(u->domainname + len, 0, sizeof(u->domainname) - len); - errno = 0; - } - up_write(&uts_sem); - return errno; -} - -SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) -{ - if (resource >= RLIM_NLIMITS) - return -EINVAL; - else { - struct rlimit value; - task_lock(current->group_leader); - value = current->signal->rlim[resource]; - task_unlock(current->group_leader); - return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; - } -} - -#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT - -/* - * Back compatibility for getrlimit. Needed for some apps. 
- */ - -SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, - struct rlimit __user *, rlim) -{ - struct rlimit x; - if (resource >= RLIM_NLIMITS) - return -EINVAL; - - task_lock(current->group_leader); - x = current->signal->rlim[resource]; - task_unlock(current->group_leader); - if (x.rlim_cur > 0x7FFFFFFF) - x.rlim_cur = 0x7FFFFFFF; - if (x.rlim_max > 0x7FFFFFFF) - x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; -} - -#endif - -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) -{ - struct rlimit new_rlim, *old_rlim; - int retval; - - if (resource >= RLIM_NLIMITS) - return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; - old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && - !capable(CAP_SYS_RESOURCE)) - return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; - - retval = security_task_setrlimit(resource, &new_rlim); - if (retval) - return retval; - - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim.rlim_cur = 1; - } - - task_lock(current->group_leader); - *old_rlim = new_rlim; - task_unlock(current->group_leader); - - if (resource != RLIMIT_CPU) - goto out; - - /* - * RLIMIT_CPU handling. Note that the kernel fails to return an error - * code if it rejected the user's attempt to set RLIMIT_CPU. This is a - * very long-standing error, and fixing it now risks breakage of - * applications, so we live with it - */ - if (new_rlim.rlim_cur == RLIM_INFINITY) - goto out; - - update_rlimit_cpu(new_rlim.rlim_cur); -out: - return 0; -} - -/* - * It would make sense to put struct rusage in the task_struct, - * except that would make the task_struct be *really big*. After - * task_struct gets moved into malloc'ed memory, it would - * make sense to do this. It will make moving the rest of the information - * a lot simpler! (Which we're not doing right now because we're not - * measuring them yet). - * - * When sampling multiple threads for RUSAGE_SELF, under SMP we might have - * races with threads incrementing their own counters. But since word - * reads are atomic, we either get new values or old values and we don't - * care which for the sums. We always take the siglock to protect reading - * the c* fields from p->signal from races with exit.c updating those - * fields when reaping, so a sample either gets all the additions of a - * given child after it's reaped, or none so this sample is before reaping. - * - * Locking: - * We need to take the siglock for CHILDEREN, SELF and BOTH - * for the cases current multithreaded, non-current single threaded - * non-current multithreaded. Thread traversal is now safe with - * the siglock held. - * Strictly speaking, we donot need to take the siglock if we are current and - * single threaded, as no one else can take our signal_struct away, no one - * else can reap the children to update signal->c* counters, and no one else - * can race with the signal-> fields. If we do not take any lock, the - * signal-> fields could be read out of order while another thread was just - * exiting. So we should place a read memory barrier when we avoid the lock. 
- * On the writer side, write memory barrier is implied in __exit_signal - * as __exit_signal releases the siglock spinlock after updating the signal-> - * fields. But we don't do this yet to keep things simple. - * - */ - -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) -{ - r->ru_nvcsw += t->nvcsw; - r->ru_nivcsw += t->nivcsw; - r->ru_minflt += t->min_flt; - r->ru_majflt += t->maj_flt; - r->ru_inblock += task_io_get_inblock(t); - r->ru_oublock += task_io_get_oublock(t); -} - -static void k_getrusage(struct task_struct *p, int who, struct rusage *r) -{ - struct task_struct *t; - unsigned long flags; - cputime_t utime, stime; - struct task_cputime cputime; - - memset((char *) r, 0, sizeof *r); - utime = stime = cputime_zero; - - if (who == RUSAGE_THREAD) { - utime = task_utime(current); - stime = task_stime(current); - accumulate_thread_rusage(p, r); - goto out; - } - - if (!lock_task_sighand(p, &flags)) - return; - - switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - - if (who == RUSAGE_CHILDREN) - break; - - case RUSAGE_SELF: - thread_group_cputime(p, &cputime); - utime = cputime_add(utime, cputime.utime); - stime = cputime_add(stime, cputime.stime); - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - t = p; - do { - accumulate_thread_rusage(t, r); - t = next_thread(t); - } while (t != p); - break; - - default: - BUG(); - } - unlock_task_sighand(p, &flags); - -out: - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); -} - -int getrusage(struct task_struct *p, int who, struct rusage __user *ru) -{ - struct rusage r; - k_getrusage(p, who, &r); - return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; -} - -SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) -{ - if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && - who != RUSAGE_THREAD) - return -EINVAL; - return getrusage(current, who, ru); -} - -SYSCALL_DEFINE1(umask, int, mask) -{ - mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); - return mask; -} - -SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, - unsigned long, arg4, unsigned long, arg5) -{ - struct task_struct *me = current; - unsigned char comm[sizeof(me->comm)]; - long error; - - error = security_task_prctl(option, arg2, arg3, arg4, arg5); - if (error != -ENOSYS) - return error; - - error = 0; - switch (option) { - case PR_SET_PDEATHSIG: - if (!valid_signal(arg2)) { - error = -EINVAL; - break; - } - me->pdeath_signal = arg2; - error = 0; - break; - case PR_GET_PDEATHSIG: - error = put_user(me->pdeath_signal, (int __user *)arg2); - break; - case PR_GET_DUMPABLE: - error = get_dumpable(me->mm); - break; - case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 1) { - error = -EINVAL; - break; - } - set_dumpable(me->mm, arg2); - error = 0; - break; - - case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(me, arg2); - break; - case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(me, arg2); - break; - case PR_SET_FPEMU: - error = SET_FPEMU_CTL(me, arg2); - break; - case PR_GET_FPEMU: - error = GET_FPEMU_CTL(me, arg2); - break; - case PR_SET_FPEXC: - error = SET_FPEXC_CTL(me, arg2); - break; - case PR_GET_FPEXC: - error = GET_FPEXC_CTL(me, arg2); - break; - case PR_GET_TIMING: - error = PR_TIMING_STATISTICAL; - break; - case PR_SET_TIMING: - if (arg2 != PR_TIMING_STATISTICAL) - error = -EINVAL; - else - error = 0; - break; - - case PR_SET_NAME: - comm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(comm, (char __user *)arg2, - sizeof(me->comm) - 1) < 0) - return -EFAULT; - set_task_comm(me, comm); - return 0; - case PR_GET_NAME: - get_task_comm(comm, me); - if (copy_to_user((char __user *)arg2, comm, - sizeof(comm))) - return -EFAULT; - return 0; - case PR_GET_ENDIAN: - error = GET_ENDIAN(me, arg2); - break; - case PR_SET_ENDIAN: - error = SET_ENDIAN(me, arg2); - break; - - case PR_GET_SECCOMP: - error = prctl_get_seccomp(); - break; - case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2); - break; - case PR_GET_TSC: - error = GET_TSC_CTL(arg2); - break; - case PR_SET_TSC: - error = SET_TSC_CTL(arg2); - break; - case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; - break; - case PR_SET_TIMERSLACK: - if (arg2 <= 0) - current->timer_slack_ns = - current->default_timer_slack_ns; - else - current->timer_slack_ns = arg2; - error = 0; - break; - default: - error = -EINVAL; - break; - } - return error; -} - -SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, - struct getcpu_cache __user *, unused) -{ - int err = 0; - int cpu = raw_smp_processor_id(); - if (cpup) - err |= put_user(cpu, cpup); - if (nodep) - err |= put_user(cpu_to_node(cpu), nodep); - return err ? -EFAULT : 0; -} - -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static void argv_cleanup(char **argv, char **envp) -{ - argv_free(argv); -} - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. 
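A sketch of how a driver typically uses this (editor's illustration: the function is hypothetical and the header placement is assumed for this tree). With force set, failure to launch the userspace helper degrades to an immediate power-off:

    #include <linux/kernel.h>
    #include <linux/reboot.h>
    #include <linux/types.h>

    /* hypothetical driver hook for a critical, unrecoverable condition */
    static void my_board_critical_fault(void)
    {
        printk(KERN_CRIT "critical fault, requesting orderly shutdown\n");
        orderly_poweroff(true);   /* runs poweroff_cmd; forces power-off on failure */
    }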
- */ -int orderly_poweroff(bool force) -{ - int argc; - char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret = -ENOMEM; - struct subprocess_info *info; - - if (argv == NULL) { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - goto out; - } - - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - argv_free(argv); - goto out; - } - - call_usermodehelper_setcleanup(info, argv_cleanup); - - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); - - out: - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - - /* I guess this should try to kick off some daemon to - sync and poweroff asap. Or not even bother syncing - if we're doing an emergency shutdown? */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); -#endif /* DDE_LINUX */ diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/time.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/time.c.svn-base deleted file mode 100644 index 4e49f06f..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/time.c.svn-base +++ /dev/null @@ -1,762 +0,0 @@ -/* - * linux/kernel/time.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file contains the interface functions for the various - * time related system calls: time, stime, gettimeofday, settimeofday, - * adjtime - */ -/* - * Modification history kernel/time.c - * - * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched.c and adjtimex() - * 1993-10-08 Torsten Duwe - * adjtime interface update and CMOS clock write code - * 1995-08-13 Torsten Duwe - * kernel PLL updated to 1994-12-13 specs (rfc-1589) - * 1999-01-16 Ulrich Windl - * Introduced error checking for many cases in adjtimex(). - * Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) - * (Even though the technical memorandum forbids it) - * 2004-07-14 Christoph Lameter - * Added getnstimeofday to allow the posix timer functions to return - * with nanosecond accuracy - */ - -#include <linux/module.h> -#include <linux/timex.h> -#include <linux/capability.h> -#include <linux/clocksource.h> -#include <linux/errno.h> -#include <linux/syscalls.h> -#include <linux/security.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/math64.h> -#include <linux/ptrace.h> - -#include <asm/uaccess.h> -#include <asm/unistd.h> - -#include "timeconst.h" - -/* - * The timezone where the local system is located. Used as a default by some - * programs who obtain this value by using gettimeofday. - */ -struct timezone sys_tz; - -EXPORT_SYMBOL(sys_tz); - -#ifdef __ARCH_WANT_SYS_TIME - -/* - * sys_time() can be implemented in user-level using - * sys_gettimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) -{ - time_t i = get_seconds(); - - if (tloc) { - if (put_user(i,tloc)) - return -EFAULT; - } - force_successful_syscall_return(); - return i; -} - -/* - * sys_stime() can be implemented in user-level using - * sys_settimeofday(). Is this for backwards compatibility? 
If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - */ - -SYSCALL_DEFINE1(stime, time_t __user *, tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_SYS_TIME */ - -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) - return -EFAULT; - } - if (unlikely(tz != NULL)) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - write_seqlock_irq(&xtime_lock); - wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; - xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); -} - -/* - * In case for some reason the CMOS clock has not already been running - * in UTC, but in some local time: The first time we set the timezone, - * we will warp the clock so that it is ticking UTC time instead of - * local time. Presumably, if someone is setting the timezone then we - * are running in an environment where the programs understand about - * timezones. This should be done at boot time in the /etc/rc script, - * as soon as possible, so that the clock can be set right. Otherwise, - * various programs will get confused when the clock gets warped. - */ - -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) -{ - static int firsttime = 1; - int error = 0; - - if (tv && !timespec_valid(tv)) - return -EINVAL; - - error = security_settime(tv, tz); - if (error) - return error; - - if (tz) { - /* SMP safe, global irq locking makes it work. */ - sys_tz = *tz; - update_vsyscall_tz(); - if (firsttime) { - firsttime = 0; - if (!tv) - warp_clock(); - } - } - if (tv) - { - /* SMP safe, again the code in arch/foo/time.c should - * globally block out interrupts when it runs. - */ - return do_settimeofday(tv); - } - return 0; -} - -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - struct timeval user_tv; - struct timespec new_ts; - struct timezone new_tz; - - if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) - return -EFAULT; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); -} - -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) -{ - struct timex txc; /* Local copy of parameter */ - int ret; - - /* Copy the user data space into the kernel copy - * structure. But bear in mind that the structures - * may change - */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) - return -EFAULT; - ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; -} - -#ifndef DDE_LINUX -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - -/* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int inline jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; -# else - return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int inline jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -# else - return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); -#endif - -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. gran must be smaller than a second. - * Always rounds down. - * - * This function should be only used for timestamps returned by - * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the latter. - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* - * Division is pretty slow so avoid it for common cases. - * Currently current_kernel_time() never returns better than - * jiffies resolution. Exploit that. - */ - if (gran <= jiffies_to_usecs(1) * 1000) { - /* nothing */ - } else if (gran == 1000000000) { - t.tv_nsec = 0; - } else { - t.tv_nsec -= t.tv_nsec % gran; - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - -#ifndef CONFIG_GENERIC_TIME -/* - * Simulate gettimeofday using do_gettimeofday which only allows a timeval - * and therefore only yields usec accuracy - */ -void getnstimeofday(struct timespec *tv) -{ - struct timeval x; - - do_gettimeofday(&x); - tv->tv_sec = x.tv_sec; - tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; -} -EXPORT_SYMBOL_GPL(getnstimeofday); -#endif - -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. - * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 - * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. - * - * [For the Julian calendar (which was used in Russia before 1917, - * Britain & colonies before 1752, anywhere else before 1582, - * and is still in use by some communities) leave out the - * -year/100+year/400 terms, and add 10.] 
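A worked check of the conversion just described (the implementation follows below), for 2000-01-01 00:00:00 UTC: the 1..12 -> 11,12,1..10 shift turns month 1 into 11 and the year into 1999, so, with integer division throughout,

    days = 1999/4 - 1999/100 + 1999/400 + 367*11/12 + 1 + 1999*365 - 719499
         = 499 - 19 + 4 + 336 + 1 + 729635 - 719499
         = 10957

and 10957 * 86400 = 946684800, the familiar epoch timestamp for that date.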
- * - * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines where long is 32-bit! (However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) - */ -unsigned long -mktime(const unsigned int year0, const unsigned int mon0, - const unsigned int day, const unsigned int hour, - const unsigned int min, const unsigned int sec) -{ - unsigned int mon = mon0, year = year0; - - /* 1..12 -> 11,12,1..10 */ - if (0 >= (int) (mon -= 2)) { - mon += 12; /* Puts Feb last since it has leap day */ - year -= 1; - } - - return ((((unsigned long) - (year/4 - year/100 + year/400 + 367*mon/12 + day) + - year*365 - 719499 - )*24 + hour /* now have hours */ - )*60 + min /* now have minutes */ - )*60 + sec; /* finally seconds */ -} - -EXPORT_SYMBOL(mktime); - -/** - * set_normalized_timespec - set timespec sec and nsec parts and normalize - * - * @ts: pointer to timespec variable to be set - * @sec: seconds to set - * @nsec: nanoseconds to set - * - * Set seconds and nanoseconds field of a timespec variable and - * normalize to the timespec storage format - * - * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC - * For negative values only the tv_sec field is negative ! - */ -void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) -{ - while (nsec >= NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} -EXPORT_SYMBOL(set_normalized_timespec); - -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - -#ifndef DDE_LINUX -/* - * Convert jiffies to milliseconds and back. 
- * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - -/* - * When we convert to jiffies then we interpret incoming values - * the following way: - * - * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) - * - * - 'too large' values [that would result in larger than - * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. - * - * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor - * - * We must also be careful about 32-bit overflows. - */ -unsigned long msecs_to_jiffies(const unsigned int m) -{ - /* - * Negative value, means infinite timeout: - */ - if ((int)m < 0) - return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. - * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. 
But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(msecs_to_jiffies); - -unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(usecs_to_jiffies); -#else /* DDE_LINUX */ -unsigned int jiffies_to_msecs(const unsigned long j) -{ - return (j*1000) / HZ; -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int jiffies_to_usecs(const unsigned long j) -{ - return (j*1000000) / HZ; -} -EXPORT_SYMBOL(jiffies_to_usecs); - -unsigned long msecs_to_jiffies(const unsigned int m) -{ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - return (m * HZ + MSEC_PER_SEC) / MSEC_PER_SEC; -} -EXPORT_SYMBOL(msecs_to_jiffies); - -unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; -} -EXPORT_SYMBOL(usecs_to_jiffies); -#endif - -/* - * The TICK_NSEC - 1 rounds up the value to the next resolution. Note - * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: - * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. - * - * Rather, we just shift the bits off the right. - * - * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec - * value to a scaled second value. - */ -unsigned long -timespec_to_jiffies(const struct timespec *value) -{ - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - nsec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)nsec * NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; - -} -EXPORT_SYMBOL(timespec_to_jiffies); - -void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_nsec = rem; -} -EXPORT_SYMBOL(jiffies_to_timespec); - -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. 
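The round-up convention spelled out above (a nonzero interval never converts to fewer ticks than requested) is easiest to see with a standalone model of the HZ <= 1000 branch; editor's sketch, assuming HZ = 100 so that one tick is 10 ms:

    #include <stdio.h>

    #define MODEL_HZ       100u        /* assumed HZ for the example */
    #define MSEC_PER_SEC_  1000u

    /* mirrors: (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ) */
    static unsigned long model_msecs_to_jiffies(unsigned int m)
    {
        return (m + (MSEC_PER_SEC_ / MODEL_HZ) - 1) / (MSEC_PER_SEC_ / MODEL_HZ);
    }

    int main(void)
    {
        printf("%lu %lu %lu\n",
               model_msecs_to_jiffies(1),    /* 1: rounded up to one tick */
               model_msecs_to_jiffies(10),   /* 1: exactly one tick       */
               model_msecs_to_jiffies(11));  /* 2                         */
        return 0;
    }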
- */ -unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} -EXPORT_SYMBOL(timeval_to_jiffies); - -void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} -EXPORT_SYMBOL(jiffies_to_timeval); - -/* - * Convert jiffies/jiffies_64 to clock_t and back. - */ -clock_t jiffies_to_clock_t(long x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - return x * (USER_HZ / HZ); -# else - return x / (HZ / USER_HZ); -# endif -#else - return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); -#endif -} -EXPORT_SYMBOL(jiffies_to_clock_t); - -#ifndef DDE_LINUX -unsigned long clock_t_to_jiffies(unsigned long x) -{ -#if (HZ % USER_HZ)==0 - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -#else - /* Don't worry about loss of precision here .. */ - if (x >= ~0UL / HZ * USER_HZ) - return ~0UL; - - /* .. but do try to contain it here */ - return div_u64((u64)x * HZ, USER_HZ); -#endif -} -#else -unsigned long clock_t_to_jiffies(unsigned long x) -{ - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -} -#endif /* DDE_LINUX */ -EXPORT_SYMBOL(clock_t_to_jiffies); - -u64 jiffies_64_to_clock_t(u64 x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - x = div_u64(x * USER_HZ, HZ); -# elif HZ > USER_HZ - x = div_u64(x, HZ / USER_HZ); -# else - /* Nothing to do */ -# endif -#else - /* - * There are better ways that don't overflow early, - * but even this doesn't overflow in hundreds of years - * in 64 bits, so.. - */ - x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); -#endif - return x; -} -EXPORT_SYMBOL(jiffies_64_to_clock_t); - -u64 nsec_to_clock_t(u64 x) -{ -#if (NSEC_PER_SEC % USER_HZ) == 0 - return div_u64(x, NSEC_PER_SEC / USER_HZ); -#elif (USER_HZ % 512) == 0 - return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); -#else - /* - * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, - * overflow after 64.99 years. - * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... - */ - return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); -#endif -} - -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) -{ - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; -} -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); - -/* - * Add two timespec values and do a safety check for overflow. 
- * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/timeconst.pl.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/timeconst.pl.svn-base deleted file mode 100644 index d459895f..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/timeconst.pl.svn-base +++ /dev/null @@ -1,378 +0,0 @@ -#!/usr/bin/perl -# ----------------------------------------------------------------------- -# -# Copyright 2007-2008 rPath, Inc. - All Rights Reserved -# -# This file is part of the Linux kernel, and is made available under -# the terms of the GNU General Public License version 2 or (at your -# option) any later version; incorporated herein by reference. -# -# ----------------------------------------------------------------------- -# - -# -# Usage: timeconst.pl HZ > timeconst.h -# - -# Precomputed values for systems without Math::BigInt -# Generated by: -# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 -%canned_values = ( - 24 => [ - '0xa6aaaaab','0x2aaaaaa',26, - 125,3, - '0xc49ba5e4','0x1fbe76c8b4',37, - 3,125, - '0xa2c2aaab','0xaaaa',16, - 125000,3, - '0xc9539b89','0x7fffbce4217d',47, - 3,125000, - ], 32 => [ - '0xfa000000','0x6000000',27, - 125,4, - '0x83126e98','0xfdf3b645a',36, - 4,125, - '0xf4240000','0x0',17, - 31250,1, - '0x8637bd06','0x3fff79c842fa',46, - 1,31250, - ], 48 => [ - '0xa6aaaaab','0x6aaaaaa',27, - 125,6, - '0xc49ba5e4','0xfdf3b645a',36, - 6,125, - '0xa2c2aaab','0x15555',17, - 62500,3, - '0xc9539b89','0x3fffbce4217d',46, - 3,62500, - ], 64 => [ - '0xfa000000','0xe000000',28, - 125,8, - '0x83126e98','0x7ef9db22d',35, - 8,125, - '0xf4240000','0x0',18, - 15625,1, - '0x8637bd06','0x1fff79c842fa',45, - 1,15625, - ], 100 => [ - '0xa0000000','0x0',28, - 10,1, - '0xcccccccd','0x733333333',35, - 1,10, - '0x9c400000','0x0',18, - 10000,1, - '0xd1b71759','0x1fff2e48e8a7',45, - 1,10000, - ], 122 => [ - '0x8325c53f','0xfbcda3a',28, - 500,61, - '0xf9db22d1','0x7fbe76c8b',35, - 61,500, - '0x8012e2a0','0x3ef36',18, - 500000,61, - '0xffda4053','0x1ffffbce4217',45, - 61,500000, - ], 128 => [ - '0xfa000000','0x1e000000',29, - 125,16, - '0x83126e98','0x3f7ced916',34, - 16,125, - '0xf4240000','0x40000',19, - 15625,2, - '0x8637bd06','0xfffbce4217d',44, - 2,15625, - ], 200 => [ - '0xa0000000','0x0',29, - 5,1, - '0xcccccccd','0x333333333',34, - 1,5, - '0x9c400000','0x0',19, - 5000,1, - '0xd1b71759','0xfff2e48e8a7',44, - 1,5000, - ], 250 => [ - '0x80000000','0x0',29, - 4,1, - '0x80000000','0x180000000',33, - 1,4, - '0xfa000000','0x0',20, - 4000,1, - '0x83126e98','0x7ff7ced9168',43, - 1,4000, - ], 256 => [ - '0xfa000000','0x3e000000',30, - 125,32, - '0x83126e98','0x1fbe76c8b',33, - 32,125, - '0xf4240000','0xc0000',20, - 15625,4, - '0x8637bd06','0x7ffde7210be',43, - 4,15625, - ], 300 => [ - '0xd5555556','0x2aaaaaaa',30, - 10,3, - '0x9999999a','0x1cccccccc',33, - 3,10, - '0xd0555556','0xaaaaa',20, - 10000,3, - '0x9d495183','0x7ffcb923a29',43, - 3,10000, - ], 512 => [ - '0xfa000000','0x7e000000',31, - 125,64, - '0x83126e98','0xfdf3b645',32, - 64,125, - '0xf4240000','0x1c0000',21, - 15625,8, - '0x8637bd06','0x3ffef39085f',42, - 8,15625, - ], 1000 => [ - '0x80000000','0x0',31, - 1,1, - '0x80000000','0x0',31, - 
1,1, - '0xfa000000','0x0',22, - 1000,1, - '0x83126e98','0x1ff7ced9168',41, - 1,1000, - ], 1024 => [ - '0xfa000000','0xfe000000',32, - 125,128, - '0x83126e98','0x7ef9db22',31, - 128,125, - '0xf4240000','0x3c0000',22, - 15625,16, - '0x8637bd06','0x1fff79c842f',41, - 16,15625, - ], 1200 => [ - '0xd5555556','0xd5555555',32, - 5,6, - '0x9999999a','0x66666666',31, - 6,5, - '0xd0555556','0x2aaaaa',22, - 2500,3, - '0x9d495183','0x1ffcb923a29',41, - 3,2500, - ] -); - -$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; - -sub bint($) -{ - my($x) = @_; - return Math::BigInt->new($x); -} - -# -# Constants for division by reciprocal multiplication. -# (bits, numerator, denominator) -# -sub fmul($$$) -{ - my ($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - return scalar (($n << $b)+$d-bint(1))/$d; -} - -sub fadj($$$) -{ - my($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - $d = $d/bgcd($n, $d); - return scalar (($d-bint(1)) << $b)/$d; -} - -sub fmuls($$$) { - my($b,$n,$d) = @_; - my($s,$m); - my($thres) = bint(1) << ($b-1); - - $n = bint($n); - $d = bint($d); - - for ($s = 0; 1; $s++) { - $m = fmul($s,$n,$d); - return $s if ($m >= $thres); - } - return 0; -} - -# Generate a hex value if the result fits in 64 bits; -# otherwise skip. -sub bignum_hex($) { - my($x) = @_; - my $s = $x->as_hex(); - - return (length($s) > 18) ? undef : $s; -} - -# Provides mul, adj, and shr factors for a specific -# (bit, time, hz) combination -sub muladj($$$) { - my($b, $t, $hz) = @_; - my $s = fmuls($b, $t, $hz); - my $m = fmul($s, $t, $hz); - my $a = fadj($s, $t, $hz); - return (bignum_hex($m), bignum_hex($a), $s); -} - -# Provides numerator, denominator values -sub numden($$) { - my($n, $d) = @_; - my $g = bgcd($n, $d); - return ($n/$g, $d/$g); -} - -# All values for a specific (time, hz) combo -sub conversions($$) { - my ($t, $hz) = @_; - my @val = (); - - # HZ_TO_xx - push(@val, muladj(32, $t, $hz)); - push(@val, numden($t, $hz)); - - # xx_TO_HZ - push(@val, muladj(32, $hz, $t)); - push(@val, numden($hz, $t)); - - return @val; -} - -sub compute_values($) { - my($hz) = @_; - my @val = (); - my $s, $m, $a, $g; - - if (!$has_bigint) { - die "$0: HZ == $hz not canned and ". 
- "Math::BigInt not available\n"; - } - - # MSEC conversions - push(@val, conversions(1000, $hz)); - - # USEC conversions - push(@val, conversions(1000000, $hz)); - - return @val; -} - -sub outputval($$) -{ - my($name, $val) = @_; - my $csuf; - - if (defined($val)) { - if ($name !~ /SHR/) { - $val = "U64_C($val)"; - } - printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; - } -} - -sub output($@) -{ - my($hz, @val) = @_; - my $pfx, $bit, $suf, $s, $m, $a; - - print "/* Automatically generated by kernel/timeconst.pl */\n"; - print "/* Conversion constants for HZ == $hz */\n"; - print "\n"; - print "#ifndef KERNEL_TIMECONST_H\n"; - print "#define KERNEL_TIMECONST_H\n"; - print "\n"; - - print "#include <linux/param.h>\n"; - print "#include <linux/types.h>\n"; - - print "\n"; - print "#if HZ != $hz && !defined(DDE_LINUX)\n"; - print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; - print "#endif\n"; - print "\n"; - - foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', - 'HZ_TO_USEC','USEC_TO_HZ') { - foreach $bit (32) { - foreach $suf ('MUL', 'ADJ', 'SHR') { - outputval("${pfx}_$suf$bit", shift(@val)); - } - } - foreach $suf ('NUM', 'DEN') { - outputval("${pfx}_$suf", shift(@val)); - } - } - - print "\n"; - print "#endif /* KERNEL_TIMECONST_H */\n"; -} - -# Pretty-print Perl values -sub perlvals(@) { - my $v; - my @l = (); - - foreach $v (@_) { - if (!defined($v)) { - push(@l, 'undef'); - } elsif ($v =~ /^0x/) { - push(@l, "\'".$v."\'"); - } else { - push(@l, $v.''); - } - } - return join(',', @l); -} - -($hz) = @ARGV; - -# Use this to generate the %canned_values structure -if ($hz eq '--can') { - shift(@ARGV); - @hzlist = sort {$a <=> $b} (@ARGV); - - print "# Precomputed values for systems without Math::BigInt\n"; - print "# Generated by:\n"; - print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; - print "\%canned_values = (\n"; - my $pf = "\t"; - foreach $hz (@hzlist) { - my @values = compute_values($hz); - print "$pf$hz => [\n"; - while (scalar(@values)) { - my $bit; - foreach $bit (32) { - my $m = shift(@values); - my $a = shift(@values); - my $s = shift(@values); - print "\t\t", perlvals($m,$a,$s), ",\n"; - } - my $n = shift(@values); - my $d = shift(@values); - print "\t\t", perlvals($n,$d), ",\n"; - } - print "\t]"; - $pf = ', '; - } - print "\n);\n"; -} else { - $hz += 0; # Force to number - if ($hz < 1) { - die "Usage: $0 HZ\n"; - } - - @val = @{$canned_values{$hz}}; - if (!defined(@val)) { - @val = compute_values($hz); - } - output($hz, @val); -} -exit 0; diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/timer.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/timer.c.svn-base deleted file mode 100644 index 3af77924..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/timer.c.svn-base +++ /dev/null @@ -1,1588 +0,0 @@ -/* - * linux/kernel/timer.c - * - * Kernel internal timers, basic process system calls - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. - * - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - * Copyright (C) 1998 Andrea Arcangeli - * 1999-03-10 Improved NTP compatibility by Ulrich Windl - * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love - * 2000-10-05 Implemented scalable SMP per-CPU timer handling. 
- * Copyright (C) 2000, 2001, 2002 Ingo Molnar - * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar - */ - -#include <linux/kernel_stat.h> -#include <linux/module.h> -#include <linux/interrupt.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/pid_namespace.h> -#include <linux/notifier.h> -#include <linux/thread_info.h> -#include <linux/time.h> -#include <linux/jiffies.h> -#include <linux/posix-timers.h> -#include <linux/cpu.h> -#include <linux/syscalls.h> -#include <linux/delay.h> -#include <linux/tick.h> -#include <linux/kallsyms.h> - -#include <asm/uaccess.h> -#include <asm/unistd.h> -#include <asm/div64.h> -#include <asm/timex.h> -#include <asm/io.h> - -#ifndef DDE_LINUX - -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - -/* - * per-CPU timer vector definitions: - */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -struct tvec { - struct list_head vec[TVN_SIZE]; -}; - -struct tvec_root { - struct list_head vec[TVR_SIZE]; -}; - -struct tvec_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long timer_jiffies; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; -} ____cacheline_aligned; - -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; - -/* - * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB for - * the new flag to indicate whether the timer is deferrable - */ -#define TBASE_DEFERRABLE_FLAG (0x1) - -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) -{ - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); -} - -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) -{ - return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); -} - -static inline void timer_set_deferrable(struct timer_list *timer) -{ - timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); -} - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) -{ - timer->base = (struct tvec_base *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); -} -#endif /* DDE_LINUX */ - -static unsigned long round_jiffies_common(unsigned long j, int cpu, - bool force_up) -{ - int rem; - unsigned long original = j; - - /* - * We don't want all cpus firing their timers at once hitting the - * same lock or cachelines, so we skew each extra cpu with an extra - * 3 jiffies. This 3 jiffies came originally from the mm/ code which - * already did this. - * The skew is done by adding 3*cpunr, then round, then subtract this - * extra offset again. - */ - j += cpu * 3; - - rem = j % HZ; - - /* - * If the target jiffie is just after a whole second (which can happen - * due to delays of the timer irq, long irq off times etc etc) then - * we should round down to the whole second, not up. Use 1/4th second - * as cutoff for this rounding as an extreme upper bound for this. - * But never round down if @force_up is set. 
- */ - if (rem < HZ/4 && !force_up) /* round down */ - j = j - rem; - else /* round up */ - j = j - rem + HZ; - - /* now that we have rounded, subtract the extra skew again */ - j -= cpu * 3; - - if (j <= jiffies) /* rounding ate our timeout entirely; */ - return original; - return j; -} - -/** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** - * __round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies_relative(unsigned long j, int cpu) -{ - unsigned long j0 = jiffies; - - /* Use j0 because jiffies might change while we run */ - return round_jiffies_common(j + j0, cpu, false) - j0; -} -EXPORT_SYMBOL_GPL(__round_jiffies_relative); - -/** - * round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * - * round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The return value is the rounded version of the @j parameter. 
- */ -unsigned long round_jiffies(unsigned long j) -{ - return round_jiffies_common(j, raw_smp_processor_id(), false); -} -EXPORT_SYMBOL_GPL(round_jiffies); - -/** - * round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * - * round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long round_jiffies_relative(unsigned long j) -{ - return __round_jiffies_relative(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies_relative); - -/** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** - * __round_jiffies_up_relative - function to round jiffies up to a full second - * @j: the time in (relative) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies_relative() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) -{ - unsigned long j0 = jiffies; - - /* Use j0 because jiffies might change while we run */ - return round_jiffies_common(j + j0, cpu, true) - j0; -} -EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); - -/** - * round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * - * This is the same as round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long round_jiffies_up(unsigned long j) -{ - return round_jiffies_common(j, raw_smp_processor_id(), true); -} -EXPORT_SYMBOL_GPL(round_jiffies_up); - -/** - * round_jiffies_up_relative - function to round jiffies up to a full second - * @j: the time in (relative) jiffies that should be rounded - * - * This is the same as round_jiffies_relative() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. 
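
As the comments above spell out, the round_jiffies() family exists so that imprecise periodic timers can all expire on the same whole-second boundary and batch their wakeups. A short sketch of the intended use in a 2.6-era driver that polls roughly once per second; the names here are illustrative:

    #include <linux/jiffies.h>
    #include <linux/timer.h>

    static struct timer_list poll_timer;

    static void poll_fn(unsigned long data)
    {
        /* ... periodic housekeeping, exact expiry time does not matter ... */
        mod_timer(&poll_timer, round_jiffies(jiffies + HZ));   /* re-arm, batched */
    }

    static void poll_start(void)
    {
        setup_timer(&poll_timer, poll_fn, 0);
        mod_timer(&poll_timer, round_jiffies(jiffies + HZ));
    }
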
- */ -unsigned long round_jiffies_up_relative(unsigned long j) -{ - return __round_jiffies_up_relative(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies_up_relative); - - -#ifndef DDE_LINUX -static inline void set_running_timer(struct tvec_base *base, - struct timer_list *timer) -{ -#ifdef CONFIG_SMP - base->running_timer = timer; -#endif -} - -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) -{ - unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); - } else { - int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: - */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; - expires = idx + base->timer_jiffies; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); -} - -#ifdef CONFIG_TIMER_STATS -void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} - -static void timer_stats_account_timer(struct timer_list *timer) -{ - unsigned int flag = 0; - - if (unlikely(tbase_get_deferrable(timer->base))) - flag |= TIMER_STATS_FLAG_DEFERRABLE; - - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, flag); -} - -#else -static void timer_stats_account_timer(struct timer_list *timer) {} -#endif - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS - -static struct debug_obj_descr timer_debug_descr; - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_init(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
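
internal_add_timer() above places a timer in one of five arrays depending on how far in the future it expires, so only the fine-grained tv1 ring has to be scanned on every tick. A runnable userspace sketch of that bucket selection, using the non-CONFIG_BASE_SMALL sizes (TVR_BITS 8, TVN_BITS 6):

    #include <stdio.h>

    #define TVN_BITS 6
    #define TVR_BITS 8
    #define TVN_SIZE (1 << TVN_BITS)
    #define TVR_SIZE (1 << TVR_BITS)
    #define TVN_MASK (TVN_SIZE - 1)
    #define TVR_MASK (TVR_SIZE - 1)

    /* Print which wheel level a timer lands in, given its distance from now. */
    static void classify(unsigned long timer_jiffies, unsigned long expires)
    {
        unsigned long idx = expires - timer_jiffies;

        if (idx < TVR_SIZE)
            printf("delta %8lu -> tv1 slot %3lu\n", idx, expires & TVR_MASK);
        else if (idx < 1UL << (TVR_BITS + TVN_BITS))
            printf("delta %8lu -> tv2 slot %3lu\n", idx,
                   (expires >> TVR_BITS) & TVN_MASK);
        else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS))
            printf("delta %8lu -> tv3 slot %3lu\n", idx,
                   (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK);
        else if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS))
            printf("delta %8lu -> tv4 slot %3lu\n", idx,
                   (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK);
        else
            printf("delta %8lu -> tv5\n", idx);
    }

    int main(void)
    {
        const unsigned long now = 1000000;    /* pretend base->timer_jiffies */
        const unsigned long deltas[] = { 1, 200, 4000, 100000, 5000000 };
        unsigned int i;

        for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++)
            classify(now, now + deltas[i]);
        return 0;
    }
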
- */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return 0; - } else { - WARN_ON_ONCE(1); - } - return 0; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_free(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -static struct debug_obj_descr timer_debug_descr = { - .name = "timer_list", - .fixup_init = timer_fixup_init, - .fixup_activate = timer_fixup_activate, - .fixup_free = timer_fixup_free, -}; - -static inline void debug_timer_init(struct timer_list *timer) -{ - debug_object_init(timer, &timer_debug_descr); -} - -static inline void debug_timer_activate(struct timer_list *timer) -{ - debug_object_activate(timer, &timer_debug_descr); -} - -static inline void debug_timer_deactivate(struct timer_list *timer) -{ - debug_object_deactivate(timer, &timer_debug_descr); -} - -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - -static void __init_timer(struct timer_list *timer); - -void init_timer_on_stack(struct timer_list *timer) -{ - debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer); -} -EXPORT_SYMBOL_GPL(init_timer_on_stack); - -void destroy_timer_on_stack(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); - -#else -static inline void debug_timer_init(struct timer_list *timer) { } -static inline void debug_timer_activate(struct timer_list *timer) { } -static inline void debug_timer_deactivate(struct timer_list *timer) { } -#endif - -static void __init_timer(struct timer_list *timer) -{ - timer->entry.next = NULL; - timer->base = __raw_get_cpu_var(tvec_bases); -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif -} - -/** - * init_timer - initialize a timer. - * @timer: the timer to be initialized - * - * init_timer() must be done to a timer prior calling *any* of the - * other timer functions. - */ -void init_timer(struct timer_list *timer) -{ - debug_timer_init(timer); - __init_timer(timer); -} -EXPORT_SYMBOL(init_timer); - -void init_timer_deferrable(struct timer_list *timer) -{ - init_timer(timer); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL(init_timer_deferrable); - -static inline void detach_timer(struct timer_list *timer, - int clear_pending) -{ - struct list_head *entry = &timer->entry; - - debug_timer_deactivate(timer); - - __list_del(entry->prev, entry->next); - if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; -} - -/* - * We are using hashed locking: holding per_cpu(tvec_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. - * - * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. - * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. 
- */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) - __acquires(timer->base->lock) -{ - struct tvec_base *base; - - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) - return base; - /* The timer has migrated to another CPU */ - spin_unlock_irqrestore(&base->lock, *flags); - } - cpu_relax(); - } -} - -int __mod_timer(struct timer_list *timer, unsigned long expires) -{ - struct tvec_base *base, *new_base; - unsigned long flags; - int ret = 0; - - timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); - - if (timer_pending(timer)) { - detach_timer(timer, 0); - ret = 1; - } - - debug_timer_activate(timer); - - new_base = __get_cpu_var(tvec_bases); - - if (base != new_base) { - /* - * We are trying to schedule the timer on the local CPU. - * However we can't change timer's base while it is running, - * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. - */ - if (likely(base->running_timer != timer)) { - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&base->lock); - base = new_base; - spin_lock(&base->lock); - timer_set_base(timer, base); - } - } - - timer->expires = expires; - internal_add_timer(base, timer); - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} - -EXPORT_SYMBOL(__mod_timer); - -/** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on - * - * This is not very scalable on SMP. Double adds are not possible. - */ -void add_timer_on(struct timer_list *timer, int cpu) -{ - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_timer_activate(timer); - internal_add_timer(base, timer); - /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. - */ - wake_up_idle_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); -} - -/** - * mod_timer - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer() is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * - * mod_timer(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - * - * Note that if there are multiple unserialized concurrent users of the - * same timer, then mod_timer() is the only safe way to modify the timeout, - * since add_timer() cannot modify an already running timer. - * - * The function returns whether it has modified a pending timer or not. - * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an - * active timer returns 1.) 
- */ -int mod_timer(struct timer_list *timer, unsigned long expires) -{ - BUG_ON(!timer->function); - - timer_stats_timer_set_start_info(timer); - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires); -} - -EXPORT_SYMBOL(mod_timer); - -/** - * del_timer - deactive a timer. - * @timer: the timer to be deactivated - * - * del_timer() deactivates a timer - this works on both active and inactive - * timers. - * - * The function returns whether it has deactivated a pending timer or not. - * (ie. del_timer() of an inactive timer returns 0, del_timer() of an - * active timer returns 1.) - */ -int del_timer(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = 0; - - timer_stats_timer_clear_start_info(timer); - if (timer_pending(timer)) { - base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - ret = 1; - } - spin_unlock_irqrestore(&base->lock, flags); - } - - return ret; -} - -EXPORT_SYMBOL(del_timer); - -#ifdef CONFIG_SMP -/** - * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer do del - * - * This function tries to deactivate a timer. Upon successful (ret >= 0) - * exit the timer is not queued and the handler is not running on any CPU. - * - * It must not be called from interrupt contexts. - */ -int try_to_del_timer_sync(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = -1; - - base = lock_timer_base(timer, &flags); - - if (base->running_timer == timer) - goto out; - - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - ret = 1; - } -out: - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} - -EXPORT_SYMBOL(try_to_del_timer_sync); - -/** - * del_timer_sync - deactivate a timer and wait for the handler to finish. - * @timer: the timer to be deactivated - * - * This function only differs from del_timer() on SMP: besides deactivating - * the timer it also makes sure the handler has finished executing on other - * CPUs. - * - * Synchronization rules: Callers must prevent restarting of the timer, - * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. The timer's handler must not call - * add_timer_on(). Upon exit the timer is not queued and the handler is - * not running on any CPU. - * - * The function returns whether it has deactivated a pending timer or not. - */ -int del_timer_sync(struct timer_list *timer) -{ - for (;;) { - int ret = try_to_del_timer_sync(timer); - if (ret >= 0) - return ret; - cpu_relax(); - } -} - -EXPORT_SYMBOL(del_timer_sync); -#endif - -static int cascade(struct tvec_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; - - list_replace_init(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. 
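
The mod_timer()/del_timer_sync() comments above amount to a usage contract: mod_timer() both arms and re-arms a timer, and del_timer_sync() must run before the object is freed so the handler cannot still be executing on another CPU. A hedged sketch of that lifecycle for a hypothetical 2.6-era driver structure:

    #include <linux/jiffies.h>
    #include <linux/slab.h>
    #include <linux/timer.h>

    struct mydev {
        struct timer_list watchdog;
        /* ... device state ... */
    };

    static void mydev_watchdog(unsigned long data)
    {
        struct mydev *dev = (struct mydev *)data;

        /* ... check the hardware ..., then re-arm for roughly 2 s later: */
        mod_timer(&dev->watchdog, jiffies + 2 * HZ);
    }

    static void mydev_start(struct mydev *dev)
    {
        setup_timer(&dev->watchdog, mydev_watchdog, (unsigned long)dev);
        mod_timer(&dev->watchdog, jiffies + 2 * HZ);
    }

    static void mydev_stop(struct mydev *dev)
    {
        del_timer_sync(&dev->watchdog);   /* waits for a running handler */
        kfree(dev);
    }
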
- */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); - } - - return index; -} - -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. - */ -static inline void __run_timers(struct tvec_base *base) -{ - struct timer_list *timer; - - spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; - - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); - while (!list_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - - timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; - - timer_stats_account_timer(timer); - - set_running_timer(base, timer); - detach_timer(timer, 1); - spin_unlock_irq(&base->lock); - { - int preempt_count = preempt_count(); - fn(data); - if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); - } - } - spin_lock_irq(&base->lock); - } - } - set_running_timer(base, NULL); - spin_unlock_irq(&base->lock); -} - -#ifdef CONFIG_NO_HZ -/* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a cpus is idle. - * This functions needs to be called disabled. - */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) -{ - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; - do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - goto cascade; - return expires; - } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = timer_jiffies & TVN_MASK; - do { - list_for_each_entry(nte, varp->vec + slot, entry) { - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? 
*/ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; - } - return expires; -} - -/* - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) -{ - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; - - /* - * Expired timer available, let it expire in the next tick - */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); - - /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): - */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; - - /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq - */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; -} - -/** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) - */ -unsigned long get_next_timer_interrupt(unsigned long now) -{ - struct tvec_base *base = __get_cpu_var(tvec_bases); - unsigned long expires; - - spin_lock(&base->lock); - expires = __next_timer_interrupt(base); - spin_unlock(&base->lock); - - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); -} -#endif - -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - int cpu = smp_processor_id(); - - /* Note: this timer irq context must be accounted for as well. */ - account_process_tick(p, user_tick); - run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_tick); - printk_tick(); - scheduler_tick(); - run_posix_cpu_timers(p); -} - -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - -/* - * This function runs timers and the timer-tq in bottom half context. 
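
calc_load() above maintains the classic 1/5/15-minute load averages as fixed-point exponential moving averages, updated once per LOAD_FREQ (five seconds' worth of ticks). A runnable userspace sketch of the same arithmetic; the FSHIFT/EXP_1 constants and the CALC_LOAD macro follow the 2.6-era <linux/sched.h> definitions, and the workload (a constant two runnable tasks) is made up:

    #include <stdio.h>

    #define FSHIFT   11
    #define FIXED_1  (1 << FSHIFT)
    #define EXP_1    1884            /* 1/exp(5sec/1min) as fixed point */

    #define CALC_LOAD(load, exp, n)          \
        load *= exp;                         \
        load += n * (FIXED_1 - exp);         \
        load >>= FSHIFT;

    int main(void)
    {
        unsigned long load1 = 0;
        const unsigned long active = 2 * FIXED_1;    /* two runnable tasks */
        int sample;

        for (sample = 1; sample <= 120; sample++) {  /* 120 x 5 s = 10 minutes */
            CALC_LOAD(load1, EXP_1, active);
            if (sample % 12 == 0)
                printf("after %2d min: load1 = %lu.%02lu\n", sample / 12,
                       load1 >> FSHIFT,
                       (load1 & (FIXED_1 - 1)) * 100 / FIXED_1);
        }
        return 0;   /* the 1-minute average converges towards ~2.00 */
    }
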
- */ -static void run_timer_softirq(struct softirq_action *h) -{ - struct tvec_base *base = __get_cpu_var(tvec_bases); - - hrtimer_run_pending(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* - * Called by the local, per-CPU timer interrupt on SMP. - */ -void run_local_timers(void) -{ - hrtimer_run_queues(); - raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); -} - -/* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_times(ticks); -} - -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - -/** - * sys_getpid - return the thread group id of the current process - * - * Note, despite the name, this returns the tgid not the pid. The tgid and - * the pid are identical unless CLONE_THREAD was specified on clone() in - * which case the tgid is the same in all threads of the same group. - * - * This is SMP safe as current->tgid does not change. - */ -SYSCALL_DEFINE0(getpid) -{ - return task_tgid_vnr(current); -} - -/* - * Accessing ->real_parent is not SMP-safe, it could - * change from under us. However, we can use a stale - * value of ->real_parent under rcu_read_lock(), see - * release_task()->call_rcu(delayed_put_task_struct). - */ -SYSCALL_DEFINE0(getppid) -{ - int pid; - - rcu_read_lock(); - pid = task_tgid_vnr(current->real_parent); - rcu_read_unlock(); - - return pid; -} - -SYSCALL_DEFINE0(getuid) -{ - /* Only we change this so SMP safe */ - return current_uid(); -} - -SYSCALL_DEFINE0(geteuid) -{ - /* Only we change this so SMP safe */ - return current_euid(); -} - -SYSCALL_DEFINE0(getgid) -{ - /* Only we change this so SMP safe */ - return current_gid(); -} - -SYSCALL_DEFINE0(getegid) -{ - /* Only we change this so SMP safe */ - return current_egid(); -} - -#endif - -static void process_timeout(unsigned long __data) -{ - wake_up_process((struct task_struct *)__data); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. 
- * - * In all cases the return value is guaranteed to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct timer_list timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - current->state = TASK_RUNNING; - goto out; - } - } - - expire = timeout + jiffies; - - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire); - schedule(); - del_singleshot_timer_sync(&timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. - */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -/* Thread ID - the internal kernel "pid" */ -SYSCALL_DEFINE0(gettid) -{ - return task_pid_vnr(current); -} - -/** - * do_sysinfo - fill in sysinfo struct - * @info: pointer to buffer to fill - */ -int do_sysinfo(struct sysinfo *info) -{ - unsigned long mem_total, sav_total; - unsigned int mem_unit, bitcount; - unsigned long seq; - - memset(info, 0, sizeof(struct sysinfo)); - - do { - struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ - - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); - - si_meminfo(info); - si_swapinfo(info); - - /* - * If the sum of all the available memory (i.e. 
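
The schedule_timeout() documentation above hides an easy-to-miss rule: the task state must be set before the call, otherwise it returns immediately. A minimal sketch of the canonical sequence; the schedule_timeout_interruptible() wrapper below is exactly this plus the state set:

    #include <linux/jiffies.h>
    #include <linux/sched.h>

    /* Sleep for about one second, or less if a signal arrives.
     * Returns 0 if the full timeout elapsed, else the jiffies remaining. */
    static signed long nap_about_a_second(void)
    {
        set_current_state(TASK_INTERRUPTIBLE);
        return schedule_timeout(HZ);
    }
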
ram + swap) - * is less than can be stored in a 32 bit unsigned long then - * we can be binary compatible with 2.2.x kernels. If not, - * well, in that case 2.2.x was broken anyways... - * - * -Erik Andersen <andersee@debian.org> - */ - - mem_total = info->totalram + info->totalswap; - if (mem_total < info->totalram || mem_total < info->totalswap) - goto out; - bitcount = 0; - mem_unit = info->mem_unit; - while (mem_unit > 1) { - bitcount++; - mem_unit >>= 1; - sav_total = mem_total; - mem_total <<= 1; - if (mem_total < sav_total) - goto out; - } - - /* - * If mem_total did not overflow, multiply all memory values by - * info->mem_unit and set it to 1. This leaves things compatible - * with 2.2.x, and also retains compatibility with earlier 2.4.x - * kernels... - */ - - info->mem_unit = 1; - info->totalram <<= bitcount; - info->freeram <<= bitcount; - info->sharedram <<= bitcount; - info->bufferram <<= bitcount; - info->totalswap <<= bitcount; - info->freeswap <<= bitcount; - info->totalhigh <<= bitcount; - info->freehigh <<= bitcount; - -out: - return 0; -} - -SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) -{ - struct sysinfo val; - - do_sysinfo(&val); - - if (copy_to_user(info, &val, sizeof(struct sysinfo))) - return -EFAULT; - - return 0; -} - -static int __cpuinit init_timers_cpu(int cpu) -{ - int j; - struct tvec_base *base; - static char __cpuinitdata tvec_base_done[NR_CPUS]; - - if (!tvec_base_done[cpu]) { - static char boot_done; - - if (boot_done) { - /* - * The APs use this path later in boot - */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); - if (!base) - return -ENOMEM; - - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); - kfree(base); - return -ENOMEM; - } - per_cpu(tvec_bases, cpu) = base; - } else { - /* - * This is for the boot CPU - we use compile-time - * static initialisation because per-cpu memory isn't - * ready yet and because the memory allocators are not - * initialised either. - */ - boot_done = 1; - base = &boot_tvec_bases; - } - tvec_base_done[cpu] = 1; - } else { - base = per_cpu(tvec_bases, cpu); - } - - spin_lock_init(&base->lock); - - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - - base->timer_jiffies = jiffies; - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) -{ - struct timer_list *timer; - - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); - timer_set_base(timer, new_base); - internal_add_timer(new_base, timer); - } -} - -static void __cpuinit migrate_timers(int cpu) -{ - struct tvec_base *old_base; - struct tvec_base *new_base; - int i; - - BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. 
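
The loop above folds info->mem_unit into the memory counters so that old 2.2-style callers see plain byte counts, but only when the result still fits in an unsigned long. A small userspace sketch of that normalisation with made-up numbers (4 KiB units):

    #include <stdio.h>

    int main(void)
    {
        unsigned long totalram = 262144, totalswap = 131072;  /* in 4 KiB units */
        unsigned long mem_total = totalram + totalswap, sav_total;
        unsigned int mem_unit = 4096, bitcount = 0;

        while (mem_unit > 1) {
            bitcount++;
            mem_unit >>= 1;
            sav_total = mem_total;
            mem_total <<= 1;
            if (mem_total < sav_total) {
                printf("would overflow: keep the original mem_unit\n");
                return 0;
            }
        }
        printf("totalram  = %lu bytes\n", totalram << bitcount);
        printf("totalswap = %lu bytes\n", totalswap << bitcount);
        return 0;
    }
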
- */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __cpuinit timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (init_timers_cpu(cpu) < 0) - return NOTIFY_BAD; - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_timers(cpu); - break; -#endif - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata timers_nb = { - .notifier_call = timer_cpu_notify, -}; - - -void __init init_timers(void) -{ - int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - - init_timer_stats(); - - BUG_ON(err == NOTIFY_BAD); - register_cpu_notifier(&timers_nb); - open_softirq(TIMER_SOFTIRQ, run_timer_softirq); -} - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); -#endif /* DDE */ - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/wait.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/wait.c.svn-base deleted file mode 100644 index b10d867f..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/wait.c.svn-base +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Generic waiting primitives. 
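
msleep() above always sleeps for at least the requested time, while msleep_interruptible() returns early (reporting the milliseconds left) when a signal arrives. A sketch of the usual retry-loop pattern; the ready() callback is hypothetical:

    #include <linux/delay.h>
    #include <linux/errno.h>

    static int wait_for_device_ready(int (*ready)(void))
    {
        int tries = 10;

        while (tries--) {
            if (ready())
                return 0;
            if (msleep_interruptible(100))   /* nonzero: a signal interrupted us */
                return -EINTR;
        }
        return -ETIMEDOUT;
    }
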
- * - * (C) 2004 William Irwin, Oracle - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/wait.h> -#include <linux/hash.h> - -#ifdef DDE_LINUX -#include "local.h" -#endif - -void init_waitqueue_head(wait_queue_head_t *q) -{ - spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->task_list); -} - -EXPORT_SYMBOL(init_waitqueue_head); - -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue); - -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue_exclusive); - -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(remove_wait_queue); - - -/* - * Note: we use "set_current_state()" _after_ the wait-queue add, - * because we need a memory barrier there on SMP, so that any - * wake-function that tests for the wait-queue being active - * will be guaranteed to see waitqueue addition _or_ subsequent - * tests in this thread will see the wakeup having taken place. - * - * The spin_unlock() itself is semi-permeable and only protects - * one way (it only protects stuff inside the critical region and - * stops them from bleeding out - it would still allow subsequent - * loads to move into the critical region). - */ -void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait); - -void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait_exclusive); - -/* - * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - /* - * We can check for list emptiness outside the lock - * IFF: - * - we use the "careful" check that verifies both - * the next and prev pointers, so that there cannot - * be any half-pending updates in progress on other - * CPU's that we haven't seen yet (and that might - * still change the stack area. - * and - * - all other users take the lock (ie we can only - * have _one_ other CPU that looks at or modifies - * the list). 
- */ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); - } -} -EXPORT_SYMBOL(finish_wait); - -/* - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @state: runstate of the waiter to be woken - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and noone wakes up - * the next waiter. - */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) -{ - int ret = default_wake_function(wait, mode, sync, key); - - if (ret) - list_del_init(&wait->task_list); - return ret; -} -EXPORT_SYMBOL(autoremove_wake_function); - -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return autoremove_wake_function(wait, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. 
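
prepare_to_wait()/finish_wait() above exist to make the classic "add to queue, test condition, sleep" loop race-free, per the memory-barrier note before prepare_to_wait(). A sketch of that canonical loop and its waker; my_queue and my_cond are illustrative:

    #include <linux/errno.h>
    #include <linux/sched.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_queue);
    static int my_cond;

    static int wait_for_cond(void)
    {
        DEFINE_WAIT(wait);
        int ret = 0;

        for (;;) {
            prepare_to_wait(&my_queue, &wait, TASK_INTERRUPTIBLE);
            if (my_cond)
                break;
            if (signal_pending(current)) {
                ret = -ERESTARTSYS;
                break;
            }
            schedule();
        }
        finish_wait(&my_queue, &wait);
        return ret;
    }

    static void make_cond_true(void)
    {
        my_cond = 1;
        wake_up(&my_queue);   /* autoremove_wake_function() dequeues the waiter */
    }
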
- */ -int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - do { - int ret; - - prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(q->key.flags); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit_lock(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) -{ -#ifndef DDE_LINUX - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &key); -#else - WARN_UNIMPL; -#endif -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ -#ifndef DDE_LINUX - const int shift = BITS_PER_LONG == 32 ? 
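
The wake_up_bit() comment above describes a protocol rather than a single call: the sleeper waits on a flag bit through the hashed waitqueue, and the waker clears the bit, issues the barrier, and only then wakes. A sketch of both halves for a hypothetical BUSY flag; the action callback simply schedules, the common choice for uninterruptible waits:

    #include <linux/bitops.h>
    #include <linux/sched.h>
    #include <linux/wait.h>

    #define MY_BUSY_BIT 0

    static unsigned long my_flags;

    static int my_bit_wait(void *word)
    {
        schedule();
        return 0;
    }

    static void wait_until_idle(void)
    {
        wait_on_bit(&my_flags, MY_BUSY_BIT, my_bit_wait, TASK_UNINTERRUPTIBLE);
    }

    static void mark_idle(void)
    {
        clear_bit(MY_BUSY_BIT, &my_flags);
        smp_mb__after_clear_bit();           /* the barrier the comment insists on */
        wake_up_bit(&my_flags, MY_BUSY_BIT);
    }
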
5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -#else - WARN_UNIMPL; - return NULL; -#endif -} -EXPORT_SYMBOL(bit_waitqueue); diff --git a/libdde_linux26/lib/src/kernel/.svn/text-base/workqueue.c.svn-base b/libdde_linux26/lib/src/kernel/.svn/text-base/workqueue.c.svn-base deleted file mode 100644 index 5ad26d9f..00000000 --- a/libdde_linux26/lib/src/kernel/.svn/text-base/workqueue.c.svn-base +++ /dev/null @@ -1,1038 +0,0 @@ -/* - * linux/kernel/workqueue.c - * - * Generic mechanism for defining kernel helper threads for running - * arbitrary tasks in process context. - * - * Started by Ingo Molnar, Copyright (C) 2002 - * - * Derived from the taskqueue/keventd code by: - * - * David Woodhouse <dwmw2@infradead.org> - * Andrew Morton - * Kai Petzke <wpp@marie.physik.tu-berlin.de> - * Theodore Ts'o <tytso@mit.edu> - * - * Made to use alloc_percpu by Christoph Lameter. - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/signal.h> -#include <linux/completion.h> -#include <linux/workqueue.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/notifier.h> -#include <linux/kthread.h> -#include <linux/hardirq.h> -#include <linux/mempolicy.h> -#include <linux/freezer.h> -#include <linux/kallsyms.h> -#include <linux/debug_locks.h> -#include <linux/lockdep.h> - -#ifdef DDE_LINUX -#include "local.h" -#endif - -/* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). - */ -struct cpu_workqueue_struct { - - spinlock_t lock; - - struct list_head worklist; - wait_queue_head_t more_work; - struct work_struct *current_work; - - struct workqueue_struct *wq; - struct task_struct *thread; - - int run_depth; /* Detect run_workqueue() recursion depth */ -} ____cacheline_aligned; - -/* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: - */ -struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ - int rt; -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif -}; - -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); - -static int singlethread_cpu __read_mostly; -static const struct cpumask *cpu_singlethread_map __read_mostly; -/* - * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD - * flushes cwq->worklist. This means that flush_workqueue/wait_on_work - * which comes in between can't use for_each_online_cpu(). We could - * use cpu_possible_map, the cpumask below is more a documentation - * than optimization. - */ -static cpumask_var_t cpu_populated_map __read_mostly; - -/* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) -{ - return wq->singlethread; -} - -static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) -{ - return is_wq_single_threaded(wq) - ? 
cpu_singlethread_map : cpu_populated_map; -} - -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) -{ - if (unlikely(is_wq_single_threaded(wq))) - cpu = singlethread_cpu; - return per_cpu_ptr(wq->cpu_wq, cpu); -} - -/* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set - */ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) -{ - unsigned long new; - - BUG_ON(!work_pending(work)); - - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); -} - -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) -{ - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); -} - -static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) -{ - set_wq_data(work, cwq); - /* - * Ensure that we get the right work->data if we see the - * result of list_add() below, see try_to_grab_pending(). - */ - smp_wmb(); - list_add_tail(&work->entry, head); - wake_up(&cwq->more_work); -} - -static void __queue_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) -{ - unsigned long flags; - - spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); - spin_unlock_irqrestore(&cwq->lock, flags); -} - -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - int ret; - - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); - - return ret; -} -EXPORT_SYMBOL_GPL(queue_work); - -/** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. - */ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) -{ - int ret = 0; - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); - ret = 1; - } - return ret; -} -EXPORT_SYMBOL_GPL(queue_work_on); - -static void delayed_work_timer_fn(unsigned long __data) -{ - struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; - - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); -} - -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. 
- */ -int queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - if (delay == 0) - return queue_work(wq, &dwork->work); - - return queue_delayed_work_on(-1, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - -/** - * queue_delayed_work_on - queue work on specific CPU after delay - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - */ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - int ret = 0; - struct timer_list *timer = &dwork->timer; - struct work_struct *work = &dwork->work; - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - - timer_stats_timer_set_start_info(&dwork->timer); - - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; - - if (unlikely(cpu >= 0)) - add_timer_on(timer, cpu); - else - add_timer(timer); - ret = 1; - } - return ret; -} -EXPORT_SYMBOL_GPL(queue_delayed_work_on); - -static void run_workqueue(struct cpu_workqueue_struct *cwq) -{ - spin_lock_irq(&cwq->lock); - cwq->run_depth++; - if (cwq->run_depth > 3) { - /* morton gets to eat his hat */ - printk("%s: recursion depth exceeded: %d\n", - __func__, cwq->run_depth); - dump_stack(); - } - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, - struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. 
- */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); -#ifndef DDE_LINUX - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); -#endif /* DDE_LINUX */ - } - - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; - } - cwq->run_depth--; - spin_unlock_irq(&cwq->lock); -} - -static int worker_thread(void *__cwq) -{ - struct cpu_workqueue_struct *cwq = __cwq; - DEFINE_WAIT(wait); - - if (cwq->wq->freezeable) - set_freezable(); - - set_user_nice(current, -5); - - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && - list_empty(&cwq->worklist)) - schedule(); - finish_wait(&cwq->more_work, &wait); - - try_to_freeze(); - - if (kthread_should_stop()) - break; - - run_workqueue(cwq); - } - - return 0; -} - -struct wq_barrier { - struct work_struct work; - struct completion done; -}; - -static void wq_barrier_func(struct work_struct *work) -{ - struct wq_barrier *barr = container_of(work, struct wq_barrier, work); - complete(&barr->done); -} - -static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) -{ - INIT_WORK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - - init_completion(&barr->done); - - insert_work(cwq, &barr->work, head); -} - -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) -{ - int active; - - if (cwq->thread == current) { - /* - * Probably keventd trying to flush its own queue. So simply run - * it by hand rather than deadlocking. - */ - run_workqueue(cwq); - active = 1; - } else { - struct wq_barrier barr; - - active = 0; - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; - } - spin_unlock_irq(&cwq->lock); - - if (active) - wait_for_completion(&barr.done); - } - - return active; -} - -/** - * flush_workqueue - ensure that any scheduled work has run to completion. - * @wq: workqueue to flush - * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. - */ -void flush_workqueue(struct workqueue_struct *wq) -{ - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; - - might_sleep(); - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - for_each_cpu_mask_nr(cpu, *cpu_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); -} -EXPORT_SYMBOL_GPL(flush_workqueue); - -/** - * flush_work - block until a work_struct's callback has terminated - * @work: the work which is to be flushed - * - * Returns false if @work has already terminated. 
- * - * It is expected that, prior to calling flush_work(), the caller has - * arranged for the work to not be requeued, otherwise it doesn't make - * sense to use this function. - */ -int flush_work(struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - struct list_head *prev; - struct wq_barrier barr; - - might_sleep(); - cwq = get_wq_data(work); - if (!cwq) - return 0; - - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - prev = NULL; - spin_lock_irq(&cwq->lock); - if (!list_empty(&work->entry)) { - /* - * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. - */ - smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) - goto out; - prev = &work->entry; - } else { - if (cwq->current_work != work) - goto out; - prev = &cwq->worklist; - } - insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; - - wait_for_completion(&barr.done); - return 1; -} -EXPORT_SYMBOL_GPL(flush_work); - -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - int ret = -1; - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) - return 0; - - /* - * The queueing is in progress, or it is already queued. Try to - * steal it from ->worklist without clearing WORK_STRUCT_PENDING. - */ - - cwq = get_wq_data(work); - if (!cwq) - return ret; - - spin_lock_irq(&cwq->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong cwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). - */ - smp_rmb(); - if (cwq == get_wq_data(work)) { - list_del_init(&work->entry); - ret = 1; - } - } - spin_unlock_irq(&cwq->lock); - - return ret; -} - -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) -{ - struct wq_barrier barr; - int running = 0; - - spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; - } - spin_unlock_irq(&cwq->lock); - - if (unlikely(running)) - wait_for_completion(&barr.done); -} - -static void wait_on_work(struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - const struct cpumask *cpu_map; - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - cpu_map = wq_cpu_map(wq); - - for_each_cpu_mask_nr(cpu, *cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); -} - -static int __cancel_work_timer(struct work_struct *work, - struct timer_list* timer) -{ - int ret; - - do { - ret = (timer && likely(del_timer(timer))); - if (!ret) - ret = try_to_grab_pending(work); - wait_on_work(work); - } while (unlikely(ret < 0)); - - work_clear_pending(work); - return ret; -} - -/** - * cancel_work_sync - block until a work_struct's callback has terminated - * @work: the work which is to be flushed - * - * Returns true if @work was pending. - * - * cancel_work_sync() will cancel the work if it is queued. If the work's - * callback appears to be running, cancel_work_sync() will block until it - * has completed. - * - * It is possible to use this function if the work re-queues itself. 
It can - * cancel the work even if it migrates to another workqueue, however in that - * case it only guarantees that work->func() has completed on the last queued - * workqueue. - * - * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not - * pending, otherwise it goes into a busy-wait loop until the timer expires. - * - * The caller must ensure that workqueue_struct on which this work was last - * queued can't be destroyed before this function returns. - */ -int cancel_work_sync(struct work_struct *work) -{ - return __cancel_work_timer(work, NULL); -} -EXPORT_SYMBOL_GPL(cancel_work_sync); - -/** - * cancel_delayed_work_sync - reliably kill off a delayed work. - * @dwork: the delayed work struct - * - * Returns true if @dwork was pending. - * - * It is possible to use this function if @dwork rearms itself via queue_work() - * or queue_delayed_work(). See also the comment for cancel_work_sync(). - */ -int cancel_delayed_work_sync(struct delayed_work *dwork) -{ - return __cancel_work_timer(&dwork->work, &dwork->timer); -} -EXPORT_SYMBOL(cancel_delayed_work_sync); - -static struct workqueue_struct *keventd_wq __read_mostly; - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * This puts a job in the kernel-global workqueue. - */ -int schedule_work(struct work_struct *work) -{ - return queue_work(keventd_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/* - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -int schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, keventd_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. - */ -int schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work(keventd_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_on_each_cpu - call a function on each online CPU from keventd - * @func: the function to call - * - * Returns zero on success. - * Returns -ve errno on failure. - * - * schedule_on_each_cpu() is very slow. 
- */ -int schedule_on_each_cpu(work_func_t func) -{ - int cpu; - struct work_struct *works; - - works = alloc_percpu(struct work_struct); - if (!works) - return -ENOMEM; - - get_online_cpus(); - for_each_online_cpu(cpu) { - struct work_struct *work = per_cpu_ptr(works, cpu); - - INIT_WORK(work, func); - schedule_work_on(cpu, work); - } - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); - free_percpu(works); - return 0; -} - -void flush_scheduled_work(void) -{ - flush_workqueue(keventd_wq); -} -EXPORT_SYMBOL(flush_scheduled_work); - -/** - * execute_in_process_context - reliably execute the routine with user context - * @fn: the function to execute - * @ew: guaranteed storage for the execute work structure (must - * be available when the work executes) - * - * Executes the function immediately if process context is available, - * otherwise schedules the function for delayed execution. - * - * Returns: 0 - function was executed - * 1 - function was scheduled for execution - */ -int execute_in_process_context(work_func_t fn, struct execute_work *ew) -{ - if (!in_interrupt()) { - fn(&ew->work); - return 0; - } - - INIT_WORK(&ew->work, fn); - schedule_work(&ew->work); - - return 1; -} -EXPORT_SYMBOL_GPL(execute_in_process_context); - -int keventd_up(void) -{ - return keventd_wq != NULL; -} - -int current_is_keventd(void) -{ - struct cpu_workqueue_struct *cwq; - int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ - int ret = 0; - - BUG_ON(!keventd_wq); - - cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); - if (current == cwq->thread) - ret = 1; - - return ret; - -} - -static struct cpu_workqueue_struct * -init_cpu_workqueue(struct workqueue_struct *wq, int cpu) -{ - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); - - return cwq; -} - -static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; - struct task_struct *p; - - p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); - /* - * Nobody can add the work_struct to this cwq, - * if (caller is __create_workqueue) - * nobody should see this wq - * else // caller is CPU_UP_PREPARE - * cpu is not on cpu_online_map - * so we can abort safely. 
- */ - if (IS_ERR(p)) - return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - cwq->thread = p; - - return 0; -} - -static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) -{ - struct task_struct *p = cwq->thread; - - if (p != NULL) { - if (cpu >= 0) - kthread_bind(p, cpu); - wake_up_process(p); - } -} - -struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - int rt, - struct lock_class_key *key, - const char *lock_name) -{ - struct workqueue_struct *wq; - struct cpu_workqueue_struct *cwq; - int err = 0, cpu; - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - return NULL; - - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } - - wq->name = name; - lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - wq->rt = rt; - INIT_LIST_HEAD(&wq->list); - - if (singlethread) { - cwq = init_cpu_workqueue(wq, singlethread_cpu); - err = create_workqueue_thread(cwq, singlethread_cpu); - start_workqueue_thread(cwq, -1); - } else { - cpu_maps_update_begin(); - /* - * We must place this wq on list even if the code below fails. - * cpu_down(cpu) can remove cpu from cpu_populated_map before - * destroy_workqueue() takes the lock, in that case we leak - * cwq[cpu]->thread. - */ - spin_lock(&workqueue_lock); - list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. - */ - for_each_possible_cpu(cpu) { - cwq = init_cpu_workqueue(wq, cpu); - if (err || !cpu_online(cpu)) - continue; - err = create_workqueue_thread(cwq, cpu); - start_workqueue_thread(cwq, cpu); - } - cpu_maps_update_done(); - } - - if (err) { - destroy_workqueue(wq); - wq = NULL; - } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue_key); - -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ - /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. - */ - if (cwq->thread == NULL) - return; - - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. - * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - kthread_stop(cwq->thread); - cwq->thread = NULL; -} - -/** - * destroy_workqueue - safely terminate a workqueue - * @wq: target workqueue - * - * Safely destroy a workqueue. All work currently pending will be done first. 
- */ -void destroy_workqueue(struct workqueue_struct *wq) -{ - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; - - cpu_maps_update_begin(); - spin_lock(&workqueue_lock); - list_del(&wq->list); - spin_unlock(&workqueue_lock); - - for_each_cpu_mask_nr(cpu, *cpu_map) - cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - cpu_maps_update_done(); - - free_percpu(wq->cpu_wq); - kfree(wq); -} -EXPORT_SYMBOL_GPL(destroy_workqueue); - -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - int ret = NOTIFY_OK; - - action &= ~CPU_TASKS_FROZEN; - - switch (action) { - case CPU_UP_PREPARE: - cpumask_set_cpu(cpu, cpu_populated_map); - } -undo: - list_for_each_entry(wq, &workqueues, list) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - switch (action) { - case CPU_UP_PREPARE: - if (!create_workqueue_thread(cwq, cpu)) - break; - printk(KERN_ERR "workqueue [%s] for %i failed\n", - wq->name, cpu); - action = CPU_UP_CANCELED; - ret = NOTIFY_BAD; - goto undo; - - case CPU_ONLINE: - start_workqueue_thread(cwq, cpu); - break; - - case CPU_UP_CANCELED: - start_workqueue_thread(cwq, -1); - case CPU_POST_DEAD: - cleanup_workqueue_thread(cwq); - break; - } - } - - switch (action) { - case CPU_UP_CANCELED: - case CPU_POST_DEAD: - cpumask_clear_cpu(cpu, cpu_populated_map); - } - - return ret; -} - -#ifdef CONFIG_SMP -static struct workqueue_struct *work_on_cpu_wq __read_mostly; - -struct work_for_cpu { - struct work_struct work; - long (*fn)(void *); - void *arg; - long ret; -}; - -static void do_work_for_cpu(struct work_struct *w) -{ - struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); - - wfc->ret = wfc->fn(wfc->arg); -} - -/** - * work_on_cpu - run a function in user context on a particular cpu - * @cpu: the cpu to run on - * @fn: the function to run - * @arg: the function arg - * - * This will return the value @fn returns. - * It is up to the caller to ensure that the cpu doesn't go offline. 
- */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) -{ - struct work_for_cpu wfc; - - INIT_WORK(&wfc.work, do_work_for_cpu); - wfc.fn = fn; - wfc.arg = arg; - queue_work_on(cpu, work_on_cpu_wq, &wfc.work); - flush_work(&wfc.work); - - return wfc.ret; -} -EXPORT_SYMBOL_GPL(work_on_cpu); -#endif /* CONFIG_SMP */ - -void __init init_workqueues(void) -{ - alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); - - cpumask_copy(cpu_populated_map, cpu_online_mask); - singlethread_cpu = cpumask_first(cpu_possible_mask); - cpu_singlethread_map = cpumask_of(singlethread_cpu); - hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); - BUG_ON(!keventd_wq); -#ifdef CONFIG_SMP - work_on_cpu_wq = create_workqueue("work_on_cpu"); - BUG_ON(!work_on_cpu_wq); -#endif -} - -#ifdef DDE_LINUX -core_initcall(init_workqueues); -#endif diff --git a/libdde_linux26/lib/src/lib/.svn/all-wcprops b/libdde_linux26/lib/src/lib/.svn/all-wcprops deleted file mode 100644 index 17468f82..00000000 --- a/libdde_linux26/lib/src/lib/.svn/all-wcprops +++ /dev/null @@ -1,11 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 62 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/lib -END -iomap.c -K 25 -svn:wc:ra_dav:version-url -V 70 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/lib/iomap.c -END diff --git a/libdde_linux26/lib/src/lib/.svn/entries b/libdde_linux26/lib/src/lib/.svn/entries deleted file mode 100644 index 25729e67..00000000 --- a/libdde_linux26/lib/src/lib/.svn/entries +++ /dev/null @@ -1,62 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/lib -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -iomap.c -file - - - - -2009-11-15T17:17:07.000000Z -961240a2153dd6c28fab535b25f31378 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -8009 - diff --git a/libdde_linux26/lib/src/lib/.svn/format b/libdde_linux26/lib/src/lib/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/lib/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/lib/.svn/text-base/iomap.c.svn-base b/libdde_linux26/lib/src/lib/.svn/text-base/iomap.c.svn-base deleted file mode 100644 index d90ac2aa..00000000 --- a/libdde_linux26/lib/src/lib/.svn/text-base/iomap.c.svn-base +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Implement the default iomap interfaces - * - * (C) Copyright 2004 Linus Torvalds - */ -#include <linux/pci.h> -#include <linux/io.h> - -#include <linux/module.h> - -#ifdef DDE_LINUX -#include "local.h" -#endif - -/* - * Read/write from/to an (offsettable) iomem cookie. It might be a PIO - * access or a MMIO access, these functions don't care. The info is - * encoded in the hardware mapping set up by the mapping functions - * (or the cookie itself, depending on implementation and hw). - * - * The generic routines don't assume any hardware mappings, and just - * encode the PIO/MMIO as part of the cookie. They coldly assume that - * the MMIO IO mappings are not in the low address range. - * - * Architectures for which this is not true can't use this generic - * implementation and should do their own copy. - */ - -#ifndef HAVE_ARCH_PIO_SIZE -/* - * We encode the physical PIO addresses (0-0xffff) into the - * pointer by offsetting them with a constant (0x10000) and - * assuming that all the low addresses are always PIO. 
That means - * we can do some sanity checks on the low bits, and don't - * need to just take things for granted. - */ -#define PIO_OFFSET 0x10000UL -#define PIO_MASK 0x0ffffUL -#define PIO_RESERVED 0x40000UL -#endif - -static void bad_io_access(unsigned long port, const char *access) -{ - static int count = 10; - if (count) { - count--; - WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access); - } -} - -/* - * Ugly macros are a way of life. - */ -#ifdef DDE_LINUX -/* DDE_LINUX maps io ports to [0xf0000000, 0xf0040000], so we also need - * to check the lower bounds of port addresses. - */ -#define IO_COND(addr, is_pio, is_mmio) do { \ - unsigned long port = (unsigned long __force)addr; \ - if (port > PIO_OFFSET && port < PIO_RESERVED) { \ - port &= PIO_MASK; \ - is_pio; \ - } else { \ - is_mmio; \ - } \ -} while (0) -#else -#define IO_COND(addr, is_pio, is_mmio) do { \ - unsigned long port = (unsigned long __force)addr; \ - if (port >= PIO_RESERVED) { \ - is_mmio; \ - } else if (port > PIO_OFFSET) { \ - port &= PIO_MASK; \ - is_pio; \ - } else \ - bad_io_access(port, #is_pio ); \ -} while (0) -#endif - -#ifndef pio_read16be -#define pio_read16be(port) swab16(inw(port)) -#define pio_read32be(port) swab32(inl(port)) -#endif - -#ifndef mmio_read16be -#define mmio_read16be(addr) be16_to_cpu(__raw_readw(addr)) -#define mmio_read32be(addr) be32_to_cpu(__raw_readl(addr)) -#endif - -unsigned int ioread8(void __iomem *addr) -{ - IO_COND(addr, return inb(port), return readb(addr)); - return 0xff; -} -unsigned int ioread16(void __iomem *addr) -{ - IO_COND(addr, return inw(port), return readw(addr)); - return 0xffff; -} -unsigned int ioread16be(void __iomem *addr) -{ - IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr)); - return 0xffff; -} -unsigned int ioread32(void __iomem *addr) -{ - IO_COND(addr, return inl(port), return readl(addr)); - return 0xffffffff; -} -unsigned int ioread32be(void __iomem *addr) -{ - IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr)); - return 0xffffffff; -} -EXPORT_SYMBOL(ioread8); -EXPORT_SYMBOL(ioread16); -EXPORT_SYMBOL(ioread16be); -EXPORT_SYMBOL(ioread32); -EXPORT_SYMBOL(ioread32be); - -#ifndef pio_write16be -#define pio_write16be(val,port) outw(swab16(val),port) -#define pio_write32be(val,port) outl(swab32(val),port) -#endif - -#ifndef mmio_write16be -#define mmio_write16be(val,port) __raw_writew(be16_to_cpu(val),port) -#define mmio_write32be(val,port) __raw_writel(be32_to_cpu(val),port) -#endif - -void iowrite8(u8 val, void __iomem *addr) -{ - IO_COND(addr, outb(val,port), writeb(val, addr)); -} -void iowrite16(u16 val, void __iomem *addr) -{ - IO_COND(addr, outw(val,port), writew(val, addr)); -} -void iowrite16be(u16 val, void __iomem *addr) -{ - IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr)); -} -void iowrite32(u32 val, void __iomem *addr) -{ - IO_COND(addr, outl(val,port), writel(val, addr)); -} -void iowrite32be(u32 val, void __iomem *addr) -{ - IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr)); -} -EXPORT_SYMBOL(iowrite8); -EXPORT_SYMBOL(iowrite16); -EXPORT_SYMBOL(iowrite16be); -EXPORT_SYMBOL(iowrite32); -EXPORT_SYMBOL(iowrite32be); - -/* - * These are the "repeat MMIO read/write" functions. - * Note the "__raw" accesses, since we don't want to - * convert to CPU byte order. We write in "IO byte - * order" (we also don't have IO barriers). 
- */ -#ifndef mmio_insb -static inline void mmio_insb(void __iomem *addr, u8 *dst, int count) -{ - while (--count >= 0) { - u8 data = __raw_readb(addr); - *dst = data; - dst++; - } -} -static inline void mmio_insw(void __iomem *addr, u16 *dst, int count) -{ - while (--count >= 0) { - u16 data = __raw_readw(addr); - *dst = data; - dst++; - } -} -static inline void mmio_insl(void __iomem *addr, u32 *dst, int count) -{ - while (--count >= 0) { - u32 data = __raw_readl(addr); - *dst = data; - dst++; - } -} -#endif - -#ifndef mmio_outsb -static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count) -{ - while (--count >= 0) { - __raw_writeb(*src, addr); - src++; - } -} -static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count) -{ - while (--count >= 0) { - __raw_writew(*src, addr); - src++; - } -} -static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count) -{ - while (--count >= 0) { - __raw_writel(*src, addr); - src++; - } -} -#endif - -void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) -{ - IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count)); -} -void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) -{ - IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count)); -} -void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) -{ - IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count)); -} -EXPORT_SYMBOL(ioread8_rep); -EXPORT_SYMBOL(ioread16_rep); -EXPORT_SYMBOL(ioread32_rep); - -void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) -{ - IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count)); -} -void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) -{ - IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count)); -} -void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) -{ - IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count)); -} -EXPORT_SYMBOL(iowrite8_rep); -EXPORT_SYMBOL(iowrite16_rep); -EXPORT_SYMBOL(iowrite32_rep); - -/* Create a virtual mapping cookie for an IO port range */ -void __iomem *ioport_map(unsigned long port, unsigned int nr) -{ - if (port > PIO_MASK) - return NULL; - return (void __iomem *) (unsigned long) (port + PIO_OFFSET); -} - -void ioport_unmap(void __iomem *addr) -{ - /* Nothing to do */ -} -EXPORT_SYMBOL(ioport_map); -EXPORT_SYMBOL(ioport_unmap); - -/** - * pci_iomap - create a virtual mapping cookie for a PCI BAR - * @dev: PCI device that owns the BAR - * @bar: BAR number - * @maxlen: length of the memory to map - * - * Using this function you will get a __iomem address to your device BAR. - * You can access it using ioread*() and iowrite*(). These functions hide - * the details if this is a MMIO or PIO address space and will just do what - * you expect from them in the correct way. - * - * @maxlen specifies the maximum length to map. If you want to get access to - * the complete BAR without checking for its length first, pass %0 here. 
- * */ -void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) -{ - resource_size_t start = pci_resource_start(dev, bar); - resource_size_t len = pci_resource_len(dev, bar); - unsigned long flags = pci_resource_flags(dev, bar); - - if (!len || !start) - return NULL; - if (maxlen && len > maxlen) - len = maxlen; - if (flags & IORESOURCE_IO) - return ioport_map(start, len); - if (flags & IORESOURCE_MEM) { - if (flags & IORESOURCE_CACHEABLE) - return ioremap(start, len); - return ioremap_nocache(start, len); - } - /* What? */ - return NULL; -} - -void pci_iounmap(struct pci_dev *dev, void __iomem * addr) -{ - IO_COND(addr, /* nothing */, iounmap(addr)); -} -EXPORT_SYMBOL(pci_iomap); -EXPORT_SYMBOL(pci_iounmap); diff --git a/libdde_linux26/lib/src/mm/.svn/all-wcprops b/libdde_linux26/lib/src/mm/.svn/all-wcprops deleted file mode 100644 index 3d4115f7..00000000 --- a/libdde_linux26/lib/src/mm/.svn/all-wcprops +++ /dev/null @@ -1,17 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 61 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/mm -END -page-writeback.c -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/mm/page-writeback.c -END -memory.c -K 25 -svn:wc:ra_dav:version-url -V 70 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/mm/memory.c -END diff --git a/libdde_linux26/lib/src/mm/.svn/entries b/libdde_linux26/lib/src/mm/.svn/entries deleted file mode 100644 index ec9bd239..00000000 --- a/libdde_linux26/lib/src/mm/.svn/entries +++ /dev/null @@ -1,96 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/mm -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -page-writeback.c -file - - - - -2009-11-15T17:17:11.000000Z -d99c926612eb64c2f3836de532c5bcba -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -40747 - -memory.c -file - - - - -2009-11-15T17:17:11.000000Z -aba936f07e9929520b9fcb9bcdf42c30 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -88290 - diff --git a/libdde_linux26/lib/src/mm/.svn/format b/libdde_linux26/lib/src/mm/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/mm/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/mm/.svn/text-base/memory.c.svn-base b/libdde_linux26/lib/src/mm/.svn/text-base/memory.c.svn-base deleted file mode 100644 index a4d66f50..00000000 --- a/libdde_linux26/lib/src/mm/.svn/text-base/memory.c.svn-base +++ /dev/null @@ -1,3203 +0,0 @@ -/* - * linux/mm/memory.c - * - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - */ - -/* - * demand-loading started 01.12.91 - seems it is high on the list of - * things wanted, and it should be easy to implement. - Linus - */ - -/* - * Ok, demand-loading was easy, shared pages a little bit tricker. Shared - * pages started 02.12.91, seems to work. - Linus. - * - * Tested sharing by executing about 30 /bin/sh: under the old kernel it - * would have taken more than the 6M I have free, but it worked well as - * far as I could see. - * - * Also corrected some "invalidate()"s - I wasn't doing enough of them. - */ - -/* - * Real VM (paging to/from disk) started 18.12.91. Much more work and - * thought has to go into this. Oh, well.. - * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. - * Found it. 
Everything seems to work now. - * 20.12.91 - Ok, making the swap-device changeable like the root. - */ - -/* - * 05.04.94 - Multi-page memory management added for v1.1. - * Idea by Alex Bligh (alex@cconcepts.co.uk) - * - * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG - * (Gerhard.Wichert@pdb.siemens.de) - * - * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) - */ - -#include <linux/kernel_stat.h> -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <linux/mman.h> -#include <linux/swap.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#ifndef DDE_LINUX -#include <linux/rmap.h> -#endif -#include <linux/module.h> -#include <linux/delayacct.h> -#include <linux/init.h> -#include <linux/writeback.h> -#include <linux/memcontrol.h> -#include <linux/mmu_notifier.h> -#include <linux/kallsyms.h> -#include <linux/swapops.h> -#include <linux/elf.h> - -#include <asm/pgalloc.h> -#include <asm/uaccess.h> -#include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> - -#include "internal.h" - -#ifndef CONFIG_NEED_MULTIPLE_NODES -/* use the per-pgdat data instead for discontigmem - mbligh */ -unsigned long max_mapnr; -#ifndef DDE_LINUX -struct page *mem_map; -#endif - -EXPORT_SYMBOL(max_mapnr); -#ifndef DDE_LINUX -EXPORT_SYMBOL(mem_map); -#endif -#endif - -unsigned long num_physpages; -/* - * A number of key systems in x86 including ioremap() rely on the assumption - * that high_memory defines the upper bound on direct map memory, then end - * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and - * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ -void * high_memory; - -EXPORT_SYMBOL(num_physpages); -EXPORT_SYMBOL(high_memory); - -/* - * Randomize the address space (stacks, mmaps, brk, etc.). - * - * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, - * as ancient (libc5 based) binaries can segfault. ) - */ -int randomize_va_space __read_mostly = -#ifdef CONFIG_COMPAT_BRK - 1; -#else - 2; -#endif - -#ifndef DDE_LINUX -static int __init disable_randmaps(char *s) -{ - randomize_va_space = 0; - return 1; -} -__setup("norandmaps", disable_randmaps); - - -/* - * If a p?d_bad entry is found while walking page tables, report - * the error, before resetting entry to p?d_none. Usually (but - * very seldom) called out from the p?d_none_or_clear_bad macros. - */ - -void pgd_clear_bad(pgd_t *pgd) -{ - pgd_ERROR(*pgd); - pgd_clear(pgd); -} - -void pud_clear_bad(pud_t *pud) -{ - pud_ERROR(*pud); - pud_clear(pud); -} - -void pmd_clear_bad(pmd_t *pmd) -{ - pmd_ERROR(*pmd); - pmd_clear(pmd); -} - -/* - * Note: this doesn't free the actual pages themselves. That - * has been handled earlier when unmapping all the memory regions. 
- */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) -{ - pgtable_t token = pmd_pgtable(*pmd); - pmd_clear(pmd); - pte_free_tlb(tlb, token); - tlb->mm->nr_ptes--; -} - -static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pmd_t *pmd; - unsigned long next; - unsigned long start; - - start = addr; - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; - free_pte_range(tlb, pmd); - } while (pmd++, addr = next, addr != end); - - start &= PUD_MASK; - if (start < floor) - return; - if (ceiling) { - ceiling &= PUD_MASK; - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - return; - - pmd = pmd_offset(pud, start); - pud_clear(pud); - pmd_free_tlb(tlb, pmd); -} - -static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pud_t *pud; - unsigned long next; - unsigned long start; - - start = addr; - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - free_pmd_range(tlb, pud, addr, next, floor, ceiling); - } while (pud++, addr = next, addr != end); - - start &= PGDIR_MASK; - if (start < floor) - return; - if (ceiling) { - ceiling &= PGDIR_MASK; - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - return; - - pud = pud_offset(pgd, start); - pgd_clear(pgd); - pud_free_tlb(tlb, pud); -} - -/* - * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. - */ -void free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ - pgd_t *pgd; - unsigned long next; - unsigned long start; - - /* - * The next few lines have given us lots of grief... - * - * Why are we testing PMD* at this top level? Because often - * there will be no work to do at all, and we'd prefer not to - * go all the way down to the bottom just to discover that. - * - * Why all these "- 1"s? Because 0 represents both the bottom - * of the address space and the top of it (using -1 for the - * top wouldn't help much: the masks would do the wrong thing). - * The rule is that addr 0 and floor 0 refer to the bottom of - * the address space, but end 0 and ceiling 0 refer to the top - * Comparisons need to use "end - 1" and "ceiling - 1" (though - * that end 0 case should be mythical). - * - * Wherever addr is brought up or ceiling brought down, we must - * be careful to reject "the opposite 0" before it confuses the - * subsequent tests. But what about where end is brought down - * by PMD_SIZE below? no, end can't go down to 0 there. - * - * Whereas we round start (addr) and ceiling down, by different - * masks at different levels, in order to test whether a table - * now has no other vmas using it, so can be freed, we don't - * bother to round floor or end up - the tests don't need that. 
- */ - - addr &= PMD_MASK; - if (addr < floor) { - addr += PMD_SIZE; - if (!addr) - return; - } - if (ceiling) { - ceiling &= PMD_MASK; - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - end -= PMD_SIZE; - if (addr > end - 1) - return; - - start = addr; - pgd = pgd_offset(tlb->mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - free_pud_range(tlb, pgd, addr, next, floor, ceiling); - } while (pgd++, addr = next, addr != end); -} - -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) -{ - while (vma) { - struct vm_area_struct *next = vma->vm_next; - unsigned long addr = vma->vm_start; - - /* - * Hide vma from rmap and vmtruncate before freeing pgtables - */ - anon_vma_unlink(vma); - unlink_file_vma(vma); - - if (is_vm_hugetlb_page(vma)) { - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next? next->vm_start: ceiling); - } else { - /* - * Optimization: gather nearby vmas into one call down - */ - while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { - vma = next; - next = vma->vm_next; - anon_vma_unlink(vma); - unlink_file_vma(vma); - } - free_pgd_range(tlb, addr, vma->vm_end, - floor, next? next->vm_start: ceiling); - } - vma = next; - } -} - -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -{ - pgtable_t new = pte_alloc_one(mm, address); - if (!new) - return -ENOMEM; - - /* - * Ensure all pte setup (eg. pte page lock and page clearing) are - * visible before the pte is made visible to other CPUs by being - * put into page tables. - * - * The other side of the story is the pointer chasing in the page - * table walking code (when walking the page table without locking; - * ie. most of the time). Fortunately, these data accesses consist - * of a chain of data-dependent loads, meaning most CPUs (alpha - * being the notable exception) will already guarantee loads are - * seen in-order. See the alpha page table accessors for the - * smp_read_barrier_depends() barriers in page table walking code. - */ - smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - - spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ - mm->nr_ptes++; - pmd_populate(mm, pmd, new); - new = NULL; - } - spin_unlock(&mm->page_table_lock); - if (new) - pte_free(mm, new); - return 0; -} - -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) -{ - pte_t *new = pte_alloc_one_kernel(&init_mm, address); - if (!new) - return -ENOMEM; - - smp_wmb(); /* See comment in __pte_alloc */ - - spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ - pmd_populate_kernel(&init_mm, pmd, new); - new = NULL; - } - spin_unlock(&init_mm.page_table_lock); - if (new) - pte_free_kernel(&init_mm, new); - return 0; -} - -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) -{ - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); -} - -/* - * This function is called to print an error when a bad pte - * is found. For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. 
- */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) -{ - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - pud_t *pud = pud_offset(pgd, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; - static unsigned long resume; - static unsigned long nr_shown; - static unsigned long nr_unshown; - - /* - * Allow a burst of 60 reports, then keep quiet for that minute; - * or allow a steady drip of one report per second. - */ - if (nr_shown == 60) { - if (time_before(jiffies, resume)) { - nr_unshown++; - return; - } - if (nr_unshown) { - printk(KERN_ALERT - "BUG: Bad page map: %lu messages suppressed\n", - nr_unshown); - nr_unshown = 0; - } - nr_shown = 0; - } - if (nr_shown++ == 0) - resume = jiffies + 60 * HZ; - - mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; - index = linear_page_index(vma, addr); - - printk(KERN_ALERT - "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); - if (page) { - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); - } - printk(KERN_ALERT - "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", - (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - /* - * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y - */ - if (vma->vm_ops) - print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", - (unsigned long)vma->vm_ops->fault); - if (vma->vm_file && vma->vm_file->f_op) - print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", - (unsigned long)vma->vm_file->f_op->mmap); - dump_stack(); - add_taint(TAINT_BAD_PAGE); -} - -static inline int is_cow_mapping(unsigned int flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - -/* - * vm_normal_page -- This function gets the "struct page" associated with a pte. - * - * "Special" mappings do not wish to be associated with a "struct page" (either - * it doesn't exist, or it exists but they don't want to touch it). In this - * case, NULL is returned here. "Normal" mappings do have a struct page. - * - * There are 2 broad cases. Firstly, an architecture may define a pte_special() - * pte bit, in which case this function is trivial. Secondly, an architecture - * may not have a spare pte bit, which requires a more complicated scheme, - * described below. - * - * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a - * special mapping (even if there are underlying and valid "struct pages"). - * COWed pages of a VM_PFNMAP are always normal. - * - * The way we recognize COWed pages within VM_PFNMAP mappings is through the - * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit - * set, and the vm_pgoff will point to the first PFN mapped: thus every special - * mapping will always honor the rule - * - * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) - * - * And for normal mappings this is false. - * - * This restricts such mappings to be a linear translation from virtual address - * to pfn. To get around this restriction, we allow arbitrary mappings so long - * as the vma is not a COW mapping; in that case, we know that all ptes are - * special (because none can have been COWed). - * - * - * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 
- * - * VM_MIXEDMAP mappings can likewise contain memory with or without "struct - * page" backing, however the difference is that _all_ pages with a struct - * page (that is, those where pfn_valid is true) are refcounted and considered - * normal pages by the VM. The disadvantage is that pages are refcounted - * (which can be slower and simply not an option for some PFNMAP users). The - * advantage is that we don't have to follow the strict linearity rule of - * PFNMAP mappings in order to support COWable mappings. - * - */ -#ifdef __HAVE_ARCH_PTE_SPECIAL -# define HAVE_PTE_SPECIAL 1 -#else -# define HAVE_PTE_SPECIAL 0 -#endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - - if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) - goto check_pfn; - if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } - - /* !HAVE_PTE_SPECIAL case follows: */ - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; - } - } - -check_pfn: - if (unlikely(pfn > highest_memmap_pfn)) { - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } -#endif - - /* - * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. - */ -out: - return pfn_to_page(pfn); -} - -/* - * copy one vm_area from one task to the other. Assumes the page tables - * already present in the new task to be cleared in the whole range - * covered by this vma. - */ - -static inline void -copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) -{ - unsigned long vm_flags = vma->vm_flags; - pte_t pte = *src_pte; - struct page *page; - - /* pte contains position in swap or file, so copy. */ - if (unlikely(!pte_present(pte))) { - if (!pte_file(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - swap_duplicate(entry); - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both parent - * and child to be set to read. 
- */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - set_pte_at(src_mm, addr, src_pte, pte); - } - } - goto out_set_pte; - } - - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if (is_cow_mapping(vm_flags)) { - ptep_set_wrprotect(src_mm, addr, src_pte); - pte = pte_wrprotect(pte); - } - - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - - page = vm_normal_page(vma, addr, pte); - if (page) { - get_page(page); - page_dup_rmap(page, vma, addr); - rss[!!PageAnon(page)]++; - } - -out_set_pte: - set_pte_at(dst_mm, addr, dst_pte, pte); -} - -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - pte_t *src_pte, *dst_pte; - spinlock_t *src_ptl, *dst_ptl; - int progress = 0; - int rss[2]; - -again: - rss[1] = rss[0] = 0; - dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) - return -ENOMEM; - src_pte = pte_offset_map_nested(src_pmd, addr); - src_ptl = pte_lockptr(src_mm, src_pmd); - spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - arch_enter_lazy_mmu_mode(); - - do { - /* - * We are holding two locks at this point - either of them - * could generate latencies in another task on another CPU. - */ - if (progress >= 32) { - progress = 0; - if (need_resched() || - spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) - break; - } - if (pte_none(*src_pte)) { - progress++; - continue; - } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); - progress += 8; - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); - - arch_leave_lazy_mmu_mode(); - spin_unlock(src_ptl); - pte_unmap_nested(src_pte - 1); - add_mm_rss(dst_mm, rss[0], rss[1]); - pte_unmap_unlock(dst_pte - 1, dst_ptl); - cond_resched(); - if (addr != end) - goto again; - return 0; -} - -static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - pmd_t *src_pmd, *dst_pmd; - unsigned long next; - - dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); - if (!dst_pmd) - return -ENOMEM; - src_pmd = pmd_offset(src_pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(src_pmd)) - continue; - if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) - return -ENOMEM; - } while (dst_pmd++, src_pmd++, addr = next, addr != end); - return 0; -} - -static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - pud_t *src_pud, *dst_pud; - unsigned long next; - - dst_pud = pud_alloc(dst_mm, dst_pgd, addr); - if (!dst_pud) - return -ENOMEM; - src_pud = pud_offset(src_pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(src_pud)) - continue; - if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) - return -ENOMEM; - } while (dst_pud++, src_pud++, addr = next, addr != end); - return 0; -} - -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) -{ - pgd_t *src_pgd, *dst_pgd; - unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int ret; - - /* - * Don't copy ptes where a page fault will fill 
them correctly. - * Fork becomes much lighter when there are big shared or private - * readonly mappings. The tradeoff is that copy_page_range is more - * efficient than faulting. - */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { - if (!vma->anon_vma) - return 0; - } - - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst_mm, src_mm, vma); - - if (unlikely(is_pfn_mapping(vma))) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_vma_copy(vma); - if (ret) - return ret; - } - - /* - * We need to invalidate the secondary MMU mappings only when - * there could be a permission downgrade on the ptes of the - * parent mm. And a permission downgrade will only happen if - * is_cow_mapping() returns true. - */ - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); - - ret = 0; - dst_pgd = pgd_offset(dst_mm, addr); - src_pgd = pgd_offset(src_mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(src_pgd)) - continue; - if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { - ret = -ENOMEM; - break; - } - } while (dst_pgd++, src_pgd++, addr = next, addr != end); - - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_end(src_mm, - vma->vm_start, end); - return ret; -} - -static unsigned long zap_pte_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) -{ - struct mm_struct *mm = tlb->mm; - pte_t *pte; - spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; - - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - arch_enter_lazy_mmu_mode(); - do { - pte_t ptent = *pte; - if (pte_none(ptent)) { - (*zap_work)--; - continue; - } - - (*zap_work) -= PAGE_SIZE; - - if (pte_present(ptent)) { - struct page *page; - - page = vm_normal_page(vma, addr, ptent); - if (unlikely(details) && page) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping && - details->check_mapping != page->mapping) - continue; - /* - * Each page->index must be checked when - * invalidating or truncating nonlinear. - */ - if (details->nonlinear_vma && - (page->index < details->first_index || - page->index > details->last_index)) - continue; - } - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - tlb_remove_tlb_entry(tlb, pte, addr); - if (unlikely(!page)) - continue; - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - addr) != page->index) - set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); - if (PageAnon(page)) - anon_rss--; - else { - if (pte_dirty(ptent)) - set_page_dirty(page); - if (pte_young(ptent) && - likely(!VM_SequentialReadHint(vma))) - mark_page_accessed(page); - file_rss--; - } - page_remove_rmap(page); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - tlb_remove_page(tlb, page); - continue; - } - /* - * If details->check_mapping, we leave swap entries; - * if details->nonlinear_vma, we leave file entries. 
- */ - if (unlikely(details)) - continue; - if (pte_file(ptent)) { - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) - print_bad_pte(vma, addr, ptent, NULL); - } else if - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - - add_mm_rss(mm, file_rss, anon_rss); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); - - return addr; -} - -static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) { - (*zap_work)--; - continue; - } - next = zap_pte_range(tlb, vma, pmd, addr, next, - zap_work, details); - } while (pmd++, addr = next, (addr != end && *zap_work > 0)); - - return addr; -} - -static inline unsigned long zap_pud_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) { - (*zap_work)--; - continue; - } - next = zap_pmd_range(tlb, vma, pud, addr, next, - zap_work, details); - } while (pud++, addr = next, (addr != end && *zap_work > 0)); - - return addr; -} - -static unsigned long unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) -{ - pgd_t *pgd; - unsigned long next; - - if (details && !details->check_mapping && !details->nonlinear_vma) - details = NULL; - - BUG_ON(addr >= end); - tlb_start_vma(tlb, vma); - pgd = pgd_offset(vma->vm_mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) { - (*zap_work)--; - continue; - } - next = zap_pud_range(tlb, vma, pgd, addr, next, - zap_work, details); - } while (pgd++, addr = next, (addr != end && *zap_work > 0)); - tlb_end_vma(tlb, vma); - - return addr; -} - -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - -/** - * unmap_vmas - unmap a range of memory covered by a list of vma's - * @tlbp: address of the caller's struct mmu_gather - * @vma: the starting vma - * @start_addr: virtual address at which to start unmapping - * @end_addr: virtual address at which to end unmapping - * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here - * @details: details of nonlinear truncation or shared cache invalidation - * - * Returns the end address of the unmapping (restart addr if interrupted). - * - * Unmap all pages in the vma list. - * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. - * - * Only addresses between `start' and `end' will be unmapped. - * - * The VMA list must be sorted in ascending virtual address order. - * - * unmap_vmas() assumes that the caller will flush the whole unmapped address - * range after unmap_vmas() returns. 
So the only responsibility here is to - * ensure that any thus-far unmapped pages are flushed before unmap_vmas() - * drops the lock and schedules. - */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *details) -{ - long zap_work = ZAP_BLOCK_SIZE; - unsigned long tlb_start = 0; /* For tlb_finish_mmu */ - int tlb_start_valid = 0; - unsigned long start = start_addr; - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; - struct mm_struct *mm = vma->vm_mm; - - mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { - unsigned long end; - - start = max(vma->vm_start, start_addr); - if (start >= vma->vm_end) - continue; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) - continue; - - if (vma->vm_flags & VM_ACCOUNT) - *nr_accounted += (end - start) >> PAGE_SHIFT; - - if (unlikely(is_pfn_mapping(vma))) - untrack_pfn_vma(vma, 0, 0); - - while (start != end) { - if (!tlb_start_valid) { - tlb_start = start; - tlb_start_valid = 1; - } - - if (unlikely(is_vm_hugetlb_page(vma))) { - /* - * It is undesirable to test vma->vm_file as it - * should be non-null for valid hugetlb area. - * However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When - * hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file - * before calling this function to clean up. - * Since no pte has actually been setup, it is - * safe to do nothing in this case. - */ - if (vma->vm_file) { - unmap_hugepage_range(vma, start, end, NULL); - zap_work -= (end - start) / - pages_per_huge_page(hstate_vma(vma)); - } - - start = end; - } else - start = unmap_page_range(*tlbp, vma, - start, end, &zap_work, details); - - if (zap_work > 0) { - BUG_ON(start != end); - break; - } - - tlb_finish_mmu(*tlbp, tlb_start, start); - - if (need_resched() || - (i_mmap_lock && spin_needbreak(i_mmap_lock))) { - if (i_mmap_lock) { - *tlbp = NULL; - goto out; - } - cond_resched(); - } - - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); - tlb_start_valid = 0; - zap_work = ZAP_BLOCK_SIZE; - } - } -out: - mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); - return start; /* which is now the end (or restart) address */ -} - -/** - * zap_page_range - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation - */ -unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details) -{ - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; - unsigned long end = address + size; - unsigned long nr_accounted = 0; - - lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); - update_hiwater_rss(mm); - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - if (tlb) - tlb_finish_mmu(tlb, address, end); - return end; -} - -/** - * zap_vma_ptes - remove ptes mapping the vma - * @vma: vm_area_struct holding ptes to be zapped - * @address: starting address of pages to zap - * @size: number of bytes to zap - * - * This function only unmaps ptes assigned to VM_PFNMAP vmas. - * - * The entire address range must be fully contained within the vma. - * - * Returns 0 if successful. 
- */ -int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, - unsigned long size) -{ - if (address < vma->vm_start || address + size > vma->vm_end || - !(vma->vm_flags & VM_PFNMAP)) - return -1; - zap_page_range(vma, address, size, NULL); - return 0; -} -EXPORT_SYMBOL_GPL(zap_vma_ptes); - -/* - * Do a quick page-table lookup for a single page. - */ -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud)) { - BUG_ON(flags & FOLL_GET); - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd)) { - BUG_ON(flags & FOLL_GET); - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pmd_bad(*pmd))) - goto no_page_table; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) - goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) - goto bad_page; - - if (flags & FOLL_GET) - get_page(page); - if (flags & FOLL_TOUCH) { - if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); - } -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return page; - -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - -no_page: - pte_unmap_unlock(ptep, ptl); - if (!pte_none(pte)) - return page; - /* Fall through to ZERO_PAGE handling */ -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate page tables. - */ - if (flags & FOLL_ANON) { - page = ZERO_PAGE(0); - if (flags & FOLL_GET) - get_page(page); - BUG_ON(flags & FOLL_WRITE); - } - return page; -} - -/* Can we do the FOLL_ANON optimization? */ -static inline int use_zero_page(struct vm_area_struct *vma) -{ - /* - * We don't want to optimize FOLL_ANON for make_pages_present() - * when it tries to page in a VM_LOCKED region. As to VM_SHARED, - * we want to get the page from the page tables to make sure - * that we serialize and update with any other user of that - * mapping. - */ - if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) - return 0; - /* - * And if we have a fault routine, it's not an anonymous region. - */ - return !vma->vm_ops || !vma->vm_ops->fault; -} - - - -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) -{ - int i; - unsigned int vm_flags = 0; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); - int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); - - if (len <= 0) - return 0; - /* - * Require read or write permissions. - * If 'force' is set, we only require the "MAY" flags. - */ - vm_flags = write ? 
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - i = 0; - - do { - struct vm_area_struct *vma; - unsigned int foll_flags; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { - unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (!ignore && write) - return i ? : -EFAULT; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if (pmd_none(*pmd)) - return i ? : -EFAULT; - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - return i ? : -EFAULT; - } - if (pages) { - struct page *page = vm_normal_page(gate_vma, start, *pte); - pages[i] = page; - if (page) - get_page(page); - } - pte_unmap(pte); - if (vmas) - vmas[i] = gate_vma; - i++; - start += PAGE_SIZE; - len--; - continue; - } - - if (!vma || - (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - (!ignore && !(vm_flags & vma->vm_flags))) - return i ? : -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i, write); - continue; - } - - foll_flags = FOLL_TOUCH; - if (pages) - foll_flags |= FOLL_GET; - if (!write && use_zero_page(vma)) - foll_flags |= FOLL_ANON; - - do { - struct page *page; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory, unless - * current is handling munlock--e.g., on exit. In - * that case, we are not allocating memory. Rather, - * we're only unlocking already resident/mapped pages. - */ - if (unlikely(!ignore_sigkill && - fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; - - if (write) - foll_flags |= FOLL_WRITE; - - cond_resched(); - while (!(page = follow_page(vma, start, foll_flags))) { - int ret; - ret = handle_mm_fault(mm, vma, start, - foll_flags & FOLL_WRITE); - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? i : -ENOMEM; - else if (ret & VM_FAULT_SIGBUS) - return i ? i : -EFAULT; - BUG(); - } - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - - cond_resched(); - } - if (IS_ERR(page)) - return i ? 
i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - } - if (vmas) - vmas[i] = vma; - i++; - start += PAGE_SIZE; - len--; - } while (len && start < vma->vm_end); - } while (len); - return i; -} - -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) -{ - int flags = 0; - - if (write) - flags |= GUP_FLAGS_WRITE; - if (force) - flags |= GUP_FLAGS_FORCE; - - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); -} - -EXPORT_SYMBOL(get_user_pages); - -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, - spinlock_t **ptl) -{ - pgd_t * pgd = pgd_offset(mm, addr); - pud_t * pud = pud_alloc(mm, pgd, addr); - if (pud) { - pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) - return pte_alloc_map_lock(mm, pmd, addr, ptl); - } - return NULL; -} - -/* - * This is the old fallback for page remapping. - * - * For historical reasons, it only allows reserved pages. Only - * old drivers should use this, and they needed to mark their - * pages reserved for the old functions anyway. - */ -static int insert_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, pgprot_t prot) -{ - struct mm_struct *mm = vma->vm_mm; - int retval; - pte_t *pte; - spinlock_t *ptl; - - retval = -EINVAL; - if (PageAnon(page)) - goto out; - retval = -ENOMEM; - flush_dcache_page(page); - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; - - /* Ok, finally just insert the thing.. */ - get_page(page); - inc_mm_counter(mm, file_rss); - page_add_file_rmap(page); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); - - retval = 0; - pte_unmap_unlock(pte, ptl); - return retval; -out_unlock: - pte_unmap_unlock(pte, ptl); -out: - return retval; -} - -/** - * vm_insert_page - insert single page into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @page: source kernel page - * - * This allows drivers to insert individual pages they've allocated - * into a user vma. - * - * The page has to be a nice clean _individual_ kernel allocation. - * If you allocate a compound page, you need to have marked it as - * such (__GFP_COMP), or manually just split the page up yourself - * (see split_page()). - * - * NOTE! Traditionally this was done with "remap_pfn_range()" which - * took an arbitrary page protection parameter. This doesn't allow - * that. Your vma protection will have to be set up correctly, which - * means that if you want a shared writable mapping, you'd better - * ask for a shared writable mapping! - * - * The page does not need to be reserved. - */ -int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page) -{ - if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; - if (!page_count(page)) - return -EINVAL; - vma->vm_flags |= VM_INSERTPAGE; - return insert_page(vma, addr, page, vma->vm_page_prot); -} -EXPORT_SYMBOL(vm_insert_page); - -static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t prot) -{ - struct mm_struct *mm = vma->vm_mm; - int retval; - pte_t *pte, entry; - spinlock_t *ptl; - - retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; - - /* Ok, finally just insert the thing.. 
*/ - entry = pte_mkspecial(pfn_pte(pfn, prot)); - set_pte_at(mm, addr, pte, entry); - update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ - - retval = 0; -out_unlock: - pte_unmap_unlock(pte, ptl); -out: - return retval; -} - -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_inert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - int ret; - pgprot_t pgprot = vma->vm_page_prot; - /* - * Technically, architectures with pte_special can avoid all these - * restrictions (same for remap_pfn_range). However we would like - * consistency in testing and feature parity among all, so we should - * try to keep these invariants in place for everybody. - */ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); - BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == - (VM_PFNMAP|VM_MIXEDMAP)); - BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); - - if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; - if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) - return -EINVAL; - - ret = insert_pfn(vma, addr, pfn, pgprot); - - if (ret) - untrack_pfn_vma(vma, pfn, PAGE_SIZE); - - return ret; -} -EXPORT_SYMBOL(vm_insert_pfn); - -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); - - if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; - - /* - * If we don't have pte special, then we have to use the pfn_valid() - * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* - * refcount the page if pfn_valid is true (hence insert_page rather - * than insert_pfn). - */ - if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { - struct page *page; - - page = pfn_to_page(pfn); - return insert_page(vma, addr, page, vma->vm_page_prot); - } - return insert_pfn(vma, addr, pfn, vma->vm_page_prot); -} -EXPORT_SYMBOL(vm_insert_mixed); - -/* - * maps a range of physical memory into the requested pages. the old - * mappings are removed. 
any references to nonexistent pages results - * in null mappings (currently treated as "copy-on-access") - */ -static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pte_t *pte; - spinlock_t *ptl; - - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (!pte) - return -ENOMEM; - arch_enter_lazy_mmu_mode(); - do { - BUG_ON(!pte_none(*pte)); - set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); - return 0; -} - -static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pmd_t *pmd; - unsigned long next; - - pfn -= addr >> PAGE_SHIFT; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - if (remap_pte_range(mm, pmd, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; - } while (pmd++, addr = next, addr != end); - return 0; -} - -static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pud_t *pud; - unsigned long next; - - pfn -= addr >> PAGE_SHIFT; - pud = pud_alloc(mm, pgd, addr); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - if (remap_pmd_range(mm, pud, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; - } while (pud++, addr = next, addr != end); - return 0; -} - -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target user address to start at - * @pfn: physical address of kernel memory - * @size: size of map area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - */ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) -{ - pgd_t *pgd; - unsigned long next; - unsigned long end = addr + PAGE_ALIGN(size); - struct mm_struct *mm = vma->vm_mm; - int err; - - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_RESERVED is specified all over the place, because - * in 2.4 it kept swapout's vma scan off this vma; but - * in 2.6 the LRU scan won't even find its pages, so this - * flag means no more than count its pages in reserved_vm, - * and omit it from core dump, even when VM_IO turned off. - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". 
- */ - if (addr == vma->vm_start && end == vma->vm_end) - vma->vm_pgoff = pfn; - else if (is_cow_mapping(vma->vm_flags)) - return -EINVAL; - - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - - err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); - if (err) { - /* - * To indicate that track_pfn related cleanup is not - * needed from higher level routine calling unmap_vmas - */ - vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - return -EINVAL; - } - - BUG_ON(addr >= end); - pfn -= addr >> PAGE_SHIFT; - pgd = pgd_offset(mm, addr); - flush_cache_range(vma, addr, end); - do { - next = pgd_addr_end(addr, end); - err = remap_pud_range(mm, pgd, addr, next, - pfn + (addr >> PAGE_SHIFT), prot); - if (err) - break; - } while (pgd++, addr = next, addr != end); - - if (err) - untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); - - return err; -} -EXPORT_SYMBOL(remap_pfn_range); - -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) -{ - pte_t *pte; - int err; - pgtable_t token; - spinlock_t *uninitialized_var(ptl); - - pte = (mm == &init_mm) ? - pte_alloc_kernel(pmd, addr) : - pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (!pte) - return -ENOMEM; - - BUG_ON(pmd_huge(*pmd)); - - arch_enter_lazy_mmu_mode(); - - token = pmd_pgtable(*pmd); - - do { - err = fn(pte, token, addr, data); - if (err) - break; - } while (pte++, addr += PAGE_SIZE, addr != end); - - arch_leave_lazy_mmu_mode(); - - if (mm != &init_mm) - pte_unmap_unlock(pte-1, ptl); - return err; -} - -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) -{ - pmd_t *pmd; - unsigned long next; - int err; - - BUG_ON(pud_huge(*pud)); - - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); - if (err) - break; - } while (pmd++, addr = next, addr != end); - return err; -} - -static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) -{ - pud_t *pud; - unsigned long next; - int err; - - pud = pud_alloc(mm, pgd, addr); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); - if (err) - break; - } while (pud++, addr = next, addr != end); - return err; -} - -/* - * Scan a region of virtual memory, filling in page tables as necessary - * and calling a provided function on each leaf page table. - */ -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, - unsigned long size, pte_fn_t fn, void *data) -{ - pgd_t *pgd; - unsigned long next; - unsigned long start = addr, end = addr + size; - int err; - - BUG_ON(addr >= end); - mmu_notifier_invalidate_range_start(mm, start, end); - pgd = pgd_offset(mm, addr); - do { - next = pgd_addr_end(addr, end); - err = apply_to_pud_range(mm, pgd, addr, next, fn, data); - if (err) - break; - } while (pgd++, addr = next, addr != end); - mmu_notifier_invalidate_range_end(mm, start, end); - return err; -} -EXPORT_SYMBOL_GPL(apply_to_page_range); - -/* - * handle_pte_fault chooses page fault handler according to an entry - * which was read non-atomically. Before making any commitment, on - * those architectures or configurations (e.g. 
i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_file_page - * must check under lock before unmapping the pte and proceeding - * (but do_wp_page is only called after already making such a check; - * and do_anonymous_page and do_no_page can safely check later on). - */ -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, - pte_t *page_table, pte_t orig_pte) -{ - int same = 1; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) - if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - same = pte_same(*page_table, orig_pte); - spin_unlock(ptl); - } -#endif - pte_unmap(page_table); - return same; -} - -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) -{ - /* - * If the source page was a PFN mapping, we don't have - * a "struct page" for it. We do a best-effort copy by - * just copying from the original user address. If that - * fails, we just zero-fill it. Live with it. - */ - if (unlikely(!src)) { - void *kaddr = kmap_atomic(dst, KM_USER0); - void __user *uaddr = (void __user *)(va & PAGE_MASK); - - /* - * This really shouldn't fail, because the page is there - * in the page tables. But it might just be unreadable, - * in which case we just give up and fill the result with - * zeroes. - */ - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(dst); - } else - copy_user_highpage(dst, src, va, vma); -} - -/* - * This routine handles present pages, when users try to write - * to a shared page. It is done by copying the page to a new address - * and decrementing the shared-page counter for the old page. - * - * Note that this routine assumes that the protection checks have been - * done by the caller (the low-level page fault routine in most cases). - * Thus we can safely just mark it writable once we've done any necessary - * COW. - * - * We also mark the page dirty at this point even though the page will - * change only once the write actually happens. This avoids a few races, - * and potentially makes it more efficient. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - spinlock_t *ptl, pte_t orig_pte) -{ - struct page *old_page, *new_page; - pte_t entry; - int reuse = 0, ret = 0; - int page_mkwrite = 0; - struct page *dirty_page = NULL; - - old_page = vm_normal_page(vma, address, orig_pte); - if (!old_page) { - /* - * VM_MIXEDMAP !pfn_valid() case - * - * We should not cow pages in a shared writeable mapping. - * Just mark the pages writable as we can't do any dirty - * accounting on raw pfn maps. 
- */ - if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED)) - goto reuse; - goto gotten; - } - - /* - * Take out anonymous pages first, anonymous shared vmas are - * not dirty accountable. - */ - if (PageAnon(old_page)) { - if (!trylock_page(old_page)) { - page_cache_get(old_page); - pte_unmap_unlock(page_table, ptl); - lock_page(old_page); - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - page_cache_release(old_page); - goto unlock; - } - page_cache_release(old_page); - } - reuse = reuse_swap_page(old_page); - unlock_page(old_page); - } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED))) { - /* - * Only catch write-faults on shared writable pages, - * read-only shared pages can get COWed by - * get_user_pages(.write=1, .force=1). - */ - if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - /* - * Notify the address space that the page is about to - * become writable so that it can prohibit this or wait - * for the page to get into an appropriate state. - * - * We do this without the lock held, so that it can - * sleep if it needs to. - */ - page_cache_get(old_page); - pte_unmap_unlock(page_table, ptl); - - if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) - goto unwritable_page; - - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - page_cache_release(old_page); - if (!pte_same(*page_table, orig_pte)) - goto unlock; - - page_mkwrite = 1; - } - dirty_page = old_page; - get_page(dirty_page); - reuse = 1; - } - - if (reuse) { -reuse: - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, entry); - ret |= VM_FAULT_WRITE; - goto unlock; - } - - /* - * Ok, we need to copy. Oh, well.. - */ - page_cache_get(old_page); -gotten: - pte_unmap_unlock(page_table, ptl); - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - VM_BUG_ON(old_page == ZERO_PAGE(0)); - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) - goto oom; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - cow_user_page(new_page, old_page, address, vma); - __SetPageUptodate(new_page); - - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) - goto oom_free_new; - - /* - * Re-check the pte - we dropped the lock - */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) { - if (old_page) { - if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); - } - } else - inc_mm_counter(mm, anon_rss); - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = mk_pte(new_page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - /* - * Clear the pte entry and flush it first, before updating the - * pte with the new entry. This will avoid a race condition - * seen in the presence of one thread doing SMC and another - * thread doing COW. 
- */ - ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address); - set_pte_at(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); - if (old_page) { - /* - * Only after switching the pte to the new page may - * we remove the mapcount here. Otherwise another - * process may come and find the rmap count decremented - * before the pte is switched to the new page, and - * "reuse" the old page writing into it while our pte - * here still points into it and can be read by other - * threads. - * - * The critical issue is to order this - * page_remove_rmap with the ptp_clear_flush above. - * Those stores are ordered by (if nothing else,) - * the barrier present in the atomic_add_negative - * in page_remove_rmap. - * - * Then the TLB flush in ptep_clear_flush ensures that - * no process can access the old page before the - * decremented mapcount is visible. And the old page - * cannot be reused until after the decremented - * mapcount is visible. So transitively, TLBs to - * old page will be flushed before it can be reused. - */ - page_remove_rmap(old_page); - } - - /* Free the old page.. */ - new_page = old_page; - ret |= VM_FAULT_WRITE; - } else - mem_cgroup_uncharge_page(new_page); - - if (new_page) - page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); -unlock: - pte_unmap_unlock(page_table, ptl); - if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); - - /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_no_page is protected similarly. - */ - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); - put_page(dirty_page); - } - return ret; -oom_free_new: - page_cache_release(new_page); -oom: - if (old_page) - page_cache_release(old_page); - return VM_FAULT_OOM; - -unwritable_page: - page_cache_release(old_page); - return VM_FAULT_SIGBUS; -} - -/* - * Helper functions for unmap_mapping_range(). - * - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ - * - * We have to restart searching the prio_tree whenever we drop the lock, - * since the iterator is only valid while the lock is held, and anyway - * a later vma might be split and reinserted earlier while lock dropped. - * - * The list of nonlinear vmas could be handled more efficiently, using - * a placeholder, but handle it in the same way until a need is shown. - * It is important to search the prio_tree before nonlinear list: a vma - * may become nonlinear and be shifted from prio_tree to nonlinear list - * while the lock is dropped; but never shifted from list to prio_tree. - * - * In order to make forward progress despite restarting the search, - * vm_truncate_count is used to mark a vma as now dealt with, so we can - * quickly skip it next time around. Since the prio_tree search only - * shows us those vmas affected by unmapping the range in question, we - * can't efficiently keep all vmas in step with mapping->truncate_count: - * so instead reset them all whenever it wraps back to 0 (then go to 1). - * mapping->truncate_count and vma->vm_truncate_count are protected by - * i_mmap_lock. - * - * In order to make forward progress despite repeatedly restarting some - * large vma, note the restart_addr from unmap_vmas when it breaks out: - * and restart from that address when we reach that vma again. 
It might - * have been split or merged, shrunk or extended, but never shifted: so - * restart_addr remains valid so long as it remains in the vma's range. - * unmap_mapping_range forces truncate_count to leap over page-aligned - * values so we can save vma's restart_addr in its truncate_count field. - */ -#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) - -static void reset_vma_truncate_counts(struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - vma->vm_truncate_count = 0; - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - vma->vm_truncate_count = 0; -} - -static int unmap_mapping_range_vma(struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) -{ - unsigned long restart_addr; - int need_break; - - /* - * files that support invalidating or truncating portions of the - * file from under mmaped areas must have their ->fault function - * return a locked page (and set VM_FAULT_LOCKED in the return). - * This provides synchronisation against concurrent unmapping here. - */ - -again: - restart_addr = vma->vm_truncate_count; - if (is_restart_addr(restart_addr) && start_addr < restart_addr) { - start_addr = restart_addr; - if (start_addr >= end_addr) { - /* Top of vma has been split off since last time */ - vma->vm_truncate_count = details->truncate_count; - return 0; - } - } - - restart_addr = zap_page_range(vma, start_addr, - end_addr - start_addr, details); - need_break = need_resched() || spin_needbreak(details->i_mmap_lock); - - if (restart_addr >= end_addr) { - /* We have now completed this vma: mark it so */ - vma->vm_truncate_count = details->truncate_count; - if (!need_break) - return 0; - } else { - /* Note restart_addr in vma's truncate_count field */ - vma->vm_truncate_count = restart_addr; - if (!need_break) - goto again; - } - - spin_unlock(details->i_mmap_lock); - cond_resched(); - spin_lock(details->i_mmap_lock); - return -EINTR; -} - -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, - struct zap_details *details) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - pgoff_t vba, vea, zba, zea; - -restart: - vma_prio_tree_foreach(vma, &iter, root, - details->first_index, details->last_index) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; - - vba = vma->vm_pgoff; - vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; - /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ - zba = details->first_index; - if (zba < vba) - zba = vba; - zea = details->last_index; - if (zea > vea) - zea = vea; - - if (unmap_mapping_range_vma(vma, - ((zba - vba) << PAGE_SHIFT) + vma->vm_start, - ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, - details) < 0) - goto restart; - } -} - -static inline void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) -{ - struct vm_area_struct *vma; - - /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. 
- */ -restart: - list_for_each_entry(vma, head, shared.vm_set.list) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; - details->nonlinear_vma = vma; - if (unmap_mapping_range_vma(vma, vma->vm_start, - vma->vm_end, details) < 0) - goto restart; - } -} - -/** - * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. - * @mapping: the address space containing mmaps to be unmapped. - * @holebegin: byte in first page to unmap, relative to the start of - * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which - * must keep the partial page. In contrast, we must get rid of - * partial pages. - * @holelen: size of prospective hole in bytes. This will be rounded - * up to a PAGE_SIZE boundary. A holelen of zero truncates to the - * end of the file. - * @even_cows: 1 when truncating a file, unmap even private COWed pages; - * but 0 when invalidating pagecache, don't throw away private data. - */ -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows) -{ - struct zap_details details; - pgoff_t hba = holebegin >> PAGE_SHIFT; - pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; - - /* Check for overflow. */ - if (sizeof(holelen) > sizeof(hlen)) { - long long holeend = - (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (holeend & ~(long long)ULONG_MAX) - hlen = ULONG_MAX - hba + 1; - } - - details.check_mapping = even_cows? NULL: mapping; - details.nonlinear_vma = NULL; - details.first_index = hba; - details.last_index = hba + hlen - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; - details.i_mmap_lock = &mapping->i_mmap_lock; - - spin_lock(&mapping->i_mmap_lock); - - /* Protect against endless unmapping loops */ - mapping->truncate_count++; - if (unlikely(is_restart_addr(mapping->truncate_count))) { - if (mapping->truncate_count == 0) - reset_vma_truncate_counts(mapping); - mapping->truncate_count++; - } - details.truncate_count = mapping->truncate_count; - - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) - unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - spin_unlock(&mapping->i_mmap_lock); -} -EXPORT_SYMBOL(unmap_mapping_range); - -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @offset: file offset to start truncating - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - if (inode->i_size < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - } else { - struct address_space *mapping = inode->i_mapping; - - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. 
- */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - i_size_write(inode, offset); - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; - -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -} -EXPORT_SYMBOL(vmtruncate); - -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) -{ - struct address_space *mapping = inode->i_mapping; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); - unmap_mapping_range(mapping, offset, (end - offset), 1); - truncate_inode_pages_range(mapping, offset, end); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); - up_write(&inode->i_alloc_sem); - mutex_unlock(&inode->i_mutex); - - return 0; -} - -/* - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) -{ - spinlock_t *ptl; - struct page *page; - swp_entry_t entry; - pte_t pte; - struct mem_cgroup *ptr = NULL; - int ret = 0; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - goto out; - - entry = pte_to_swp_entry(orig_pte); - if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); - goto out; - } - delayacct_set_flag(DELAYACCT_PF_SWAPIN); - page = lookup_swap_cache(entry); - if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, address); - if (!page) { - /* - * Back out if somebody else faulted in this pte - * while we released the pte lock. - */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) - ret = VM_FAULT_OOM; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - goto unlock; - } - - /* Had to read the page from swap area: Major fault */ - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - - mark_page_accessed(page); - - lock_page(page); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - - if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { - ret = VM_FAULT_OOM; - unlock_page(page); - goto out; - } - - /* - * Back out if somebody else already faulted in this pte. - */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*page_table, orig_pte))) - goto out_nomap; - - if (unlikely(!PageUptodate(page))) { - ret = VM_FAULT_SIGBUS; - goto out_nomap; - } - - /* - * The page isn't present yet, go ahead with the fault. - * - * Be careful about the sequence of operations here. 
- * To get its accounting right, reuse_swap_page() must be called - * while the page is counted on swap but not yet in mapcount i.e. - * before page_add_anon_rmap() and swap_free(); try_to_free_swap() - * must be called after the swap_free(), or it will never succeed. - * Because delete_from_swap_page() may be called by reuse_swap_page(), - * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry - * in page->private. In this case, a record in swap_cgroup is silently - * discarded at swap_free(). - */ - - inc_mm_counter(mm, anon_rss); - pte = mk_pte(page, vma->vm_page_prot); - if (write_access && reuse_swap_page(page)) { - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - write_access = 0; - } - flush_icache_page(vma, page); - set_pte_at(mm, address, page_table, pte); - page_add_anon_rmap(page, vma, address); - /* It's better to call commit-charge after rmap is established */ - mem_cgroup_commit_charge_swapin(page, ptr); - - swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - try_to_free_swap(page); - unlock_page(page); - - if (write_access) { - ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); - if (ret & VM_FAULT_ERROR) - ret &= VM_FAULT_ERROR; - goto out; - } - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); -unlock: - pte_unmap_unlock(page_table, ptl); -out: - return ret; -out_nomap: - mem_cgroup_cancel_charge_swapin(ptr); - pte_unmap_unlock(page_table, ptl); - unlock_page(page); - page_cache_release(page); - return ret; -} - -/* - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) -{ - struct page *page; - spinlock_t *ptl; - pte_t entry; - - /* Allocate our own private page. */ - pte_unmap(page_table); - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_zeroed_user_highpage_movable(vma, address); - if (!page) - goto oom; - __SetPageUptodate(page); - - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) - goto oom_free_page; - - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) - goto release; - inc_mm_counter(mm, anon_rss); - page_add_new_anon_rmap(page, vma, address); - set_pte_at(mm, address, page_table, entry); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); -unlock: - pte_unmap_unlock(page_table, ptl); - return 0; -release: - mem_cgroup_uncharge_page(page); - page_cache_release(page); - goto unlock; -oom_free_page: - page_cache_release(page); -oom: - return VM_FAULT_OOM; -} - -/* - * __do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. 
- */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) -{ - pte_t *page_table; - spinlock_t *ptl; - struct page *page; - pte_t entry; - int anon = 0; - int charged = 0; - struct page *dirty_page = NULL; - struct vm_fault vmf; - int ret; - int page_mkwrite = 0; - - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.page = NULL; - - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) - return ret; - - /* - * For consistency in subsequent calls, make the faulted page always - * locked. - */ - if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); - else - VM_BUG_ON(!PageLocked(vmf.page)); - - /* - * Should we do an early C-O-W break? - */ - page = vmf.page; - if (flags & FAULT_FLAG_WRITE) { - if (!(vma->vm_flags & VM_SHARED)) { - anon = 1; - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto out; - } - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - page_cache_release(page); - goto out; - } - charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); - copy_user_highpage(page, vmf.page, address, vma); - __SetPageUptodate(page); - } else { - /* - * If the page will be shareable, see if the backing - * address space wants to know that the page is about - * to become writable - */ - if (vma->vm_ops->page_mkwrite) { - unlock_page(page); - if (vma->vm_ops->page_mkwrite(vma, page) < 0) { - ret = VM_FAULT_SIGBUS; - anon = 1; /* no anon but release vmf.page */ - goto out_unlocked; - } - lock_page(page); - /* - * XXX: this is not quite right (racy vs - * invalidate) to unlock and relock the page - * like this, however a better fix requires - * reworking page_mkwrite locking API, which - * is better done later. - */ - if (!page->mapping) { - ret = 0; - anon = 1; /* no anon but release vmf.page */ - goto out; - } - page_mkwrite = 1; - } - } - - } - - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if write_access is true, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. - */ - /* Only go through if we didn't race with anybody else... 
*/ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (anon) { - inc_mm_counter(mm, anon_rss); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter(mm, file_rss); - page_add_file_rmap(page); - if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); - } - } - set_pte_at(mm, address, page_table, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, entry); - } else { - if (charged) - mem_cgroup_uncharge_page(page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ - } - - pte_unmap_unlock(page_table, ptl); - -out: - unlock_page(vmf.page); -out_unlocked: - if (anon) - page_cache_release(vmf.page); - else if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); - - set_page_dirty_balance(dirty_page, page_mkwrite); - put_page(dirty_page); - } - - return ret; -} - -static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) -{ - pgoff_t pgoff = (((address & PAGE_MASK) - - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); - - pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); -} - -/* - * Fault of a previously existing named mapping. Repopulate the pte - * from the encoded file_pte if possible. This enables swappable - * nonlinear vmas. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access, pte_t orig_pte) -{ - unsigned int flags = FAULT_FLAG_NONLINEAR | - (write_access ? FAULT_FLAG_WRITE : 0); - pgoff_t pgoff; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return 0; - - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { - /* - * Page table corrupted: show pte and kill process. - */ - print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_OOM; - } - - pgoff = pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); -} - -/* - * These routines also need to handle stuff like marking pages dirty - * and/or accessed for architectures that don't do it in hardware (most - * RISC architectures). The early dirtying is also good on the i386. - * - * There is also a hook called "update_mmu_cache()" that architectures - * with external mmu caches can use to update those (ie the Sparc or - * PowerPC hashed page tables that act as extended TLBs). - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. 
- */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, int write_access) -{ - pte_t entry; - spinlock_t *ptl; - - entry = *pte; - if (!pte_present(entry)) { - if (pte_none(entry)) { - if (vma->vm_ops) { - if (likely(vma->vm_ops->fault)) - return do_linear_fault(mm, vma, address, - pte, pmd, write_access, entry); - } - return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); - } - if (pte_file(entry)) - return do_nonlinear_fault(mm, vma, address, - pte, pmd, write_access, entry); - return do_swap_page(mm, vma, address, - pte, pmd, write_access, entry); - } - - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*pte, entry))) - goto unlock; - if (write_access) { - if (!pte_write(entry)) - return do_wp_page(mm, vma, address, - pte, pmd, ptl, entry); - entry = pte_mkdirty(entry); - } - entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { - update_mmu_cache(vma, address, entry); - } else { - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. - */ - if (write_access) - flush_tlb_page(vma, address); - } -unlock: - pte_unmap_unlock(pte, ptl); - return 0; -} - -/* - * By the time we get here, we already hold the mm semaphore - */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); - - if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, write_access); - - pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) - return VM_FAULT_OOM; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) - return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); - if (!pte) - return VM_FAULT_OOM; - - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); -} - -#ifndef __PAGETABLE_PUD_FOLDED -/* - * Allocate page upper directory. - * We've already handled the fast-path in-line. - */ -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - pud_t *new = pud_alloc_one(mm, address); - if (!new) - return -ENOMEM; - - smp_wmb(); /* See comment in __pte_alloc */ - - spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) /* Another has populated it */ - pud_free(mm, new); - else - pgd_populate(mm, pgd, new); - spin_unlock(&mm->page_table_lock); - return 0; -} -#endif /* __PAGETABLE_PUD_FOLDED */ - -#ifndef __PAGETABLE_PMD_FOLDED -/* - * Allocate page middle directory. - * We've already handled the fast-path in-line. 
- */ -int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - pmd_t *new = pmd_alloc_one(mm, address); - if (!new) - return -ENOMEM; - - smp_wmb(); /* See comment in __pte_alloc */ - - spin_lock(&mm->page_table_lock); -#ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) /* Another has populated it */ - pmd_free(mm, new); - else - pud_populate(mm, pud, new); -#else - if (pgd_present(*pud)) /* Another has populated it */ - pmd_free(mm, new); - else - pgd_populate(mm, pud, new); -#endif /* __ARCH_HAS_4LEVEL_HACK */ - spin_unlock(&mm->page_table_lock); - return 0; -} -#endif /* __PAGETABLE_PMD_FOLDED */ - -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - if (!vma) - return -ENOMEM; - write = (vma->vm_flags & VM_WRITE) != 0; - BUG_ON(addr >= end); - BUG_ON(end > vma->vm_end); - len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - if (ret < 0) - return ret; - return ret == len ? 0 : -EFAULT; -} - -#if !defined(__HAVE_ARCH_GATE_AREA) - -#if defined(AT_SYSINFO_EHDR) -static struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; - return 0; -} -__initcall(gate_vma_init); -#endif - -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) -{ -#ifdef AT_SYSINFO_EHDR - return &gate_vma; -#else - return NULL; -#endif -} - -int in_gate_area_no_task(unsigned long addr) -{ -#ifdef AT_SYSINFO_EHDR - if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) - return 1; -#endif - return 0; -} - -#endif /* __HAVE_ARCH_GATE_AREA */ - -#ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - resource_size_t phys_addr = 0; - struct mm_struct *mm = vma->vm_mm; - int ret = -EINVAL; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - - /* We cannot handle huge page PFN maps. Luckily they don't exist. 
*/ - if (pmd_huge(*pmd)) - goto out; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!ptep) - goto out; - - pte = *ptep; - if (!pte_present(pte)) - goto unlock; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - phys_addr = pte_pfn(pte); - phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ - - *prot = pgprot_val(pte_pgprot(pte)); - *phys = phys_addr; - ret = 0; - -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return ret; -} - -int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) -{ - resource_size_t phys_addr; - unsigned long prot = 0; - void __iomem *maddr; - int offset = addr & (PAGE_SIZE-1); - - if (follow_phys(vma, addr, write, &prot, &phys_addr)) - return -EINVAL; - - maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); - if (write) - memcpy_toio(maddr + offset, buf, len); - else - memcpy_fromio(buf, maddr + offset, len); - iounmap(maddr); - - return len; -} -#endif - -/* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages - */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) -{ - struct mm_struct *mm; - struct vm_area_struct *vma; - void *old_buf = buf; - - mm = get_task_mm(tsk); - if (!mm) - return 0; - - down_read(&mm->mmap_sem); - /* ignore errors, just check how much was successfully transferred */ - while (len) { - int bytes, ret, offset; - void *maddr; - struct page *page = NULL; - - ret = get_user_pages(tsk, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) { - /* - * Check if this is a VM_IO | VM_PFNMAP VMA, which - * we can access using slightly different code. - */ -#ifdef CONFIG_HAVE_IOREMAP_PROT - vma = find_vma(mm, addr); - if (!vma) - break; - if (vma->vm_ops && vma->vm_ops->access) - ret = vma->vm_ops->access(vma, addr, buf, - len, write); - if (ret <= 0) -#endif - break; - bytes = ret; - } else { - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); - } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); - } - kunmap(page); - page_cache_release(page); - } - len -= bytes; - buf += bytes; - addr += bytes; - } - up_read(&mm->mmap_sem); - mmput(mm); - - return buf - old_buf; -} - -/* - * Print the name of a VMA. 
- */ -void print_vma_addr(char *prefix, unsigned long ip) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - /* - * Do not print if we are in atomic - * contexts (in exception stacks, etc.): - */ - if (preempt_count()) - return; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, ip); - if (vma && vma->vm_file) { - struct file *f = vma->vm_file; - char *buf = (char *)__get_free_page(GFP_KERNEL); - if (buf) { - char *p, *s; - - p = d_path(&f->f_path, buf, PAGE_SIZE); - if (IS_ERR(p)) - p = "?"; - s = strrchr(p, '/'); - if (s) - p = s+1; - printk("%s%s[%lx+%lx]", prefix, p, - vma->vm_start, - vma->vm_end - vma->vm_start); - free_page((unsigned long)buf); - } - } - up_read(&current->mm->mmap_sem); -} - -#ifdef CONFIG_PROVE_LOCKING -void might_fault(void) -{ - /* - * Some code (nfs/sunrpc) uses socket ops on kernel memory while - * holding the mmap_sem, this is safe because kernel memory doesn't - * get paged out, therefore we'll never actually fault, and the - * below annotations will generate false positives. - */ - if (segment_eq(get_fs(), KERNEL_DS)) - return; - - might_sleep(); - /* - * it would be nicer only to annotate paths which are not under - * pagefault_disable, however that requires a larger audit and - * providing helpers like get_user_atomic. - */ - if (!in_atomic() && current->mm) - might_lock_read(&current->mm->mmap_sem); -} -EXPORT_SYMBOL(might_fault); -#endif -#endif /* DDE_LINUX */ diff --git a/libdde_linux26/lib/src/mm/.svn/text-base/page-writeback.c.svn-base b/libdde_linux26/lib/src/mm/.svn/text-base/page-writeback.c.svn-base deleted file mode 100644 index 8a325e2a..00000000 --- a/libdde_linux26/lib/src/mm/.svn/text-base/page-writeback.c.svn-base +++ /dev/null @@ -1,1468 +0,0 @@ -/* - * mm/page-writeback.c - * - * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - * - * Contains functions related to writing back dirty pages at the - * address_space level. - * - * 10Apr2002 Andrew Morton - * Initial version - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/init.h> -#include <linux/backing-dev.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/blkdev.h> -#include <linux/mpage.h> -#include <linux/rmap.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/smp.h> -#include <linux/sysctl.h> -#include <linux/cpu.h> -#include <linux/syscalls.h> -#include <linux/buffer_head.h> -#include <linux/pagevec.h> - -/* - * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - -/* - * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited - * will look to see if it needs to force writeback or throttling. - */ -static long ratelimit_pages = 32; - -/* - * When balance_dirty_pages decides that the caller needs to perform some - * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably - * large amounts of I/O are submitted.
- */ -static inline long sync_writeback_pages(void) -{ - return ratelimit_pages + ratelimit_pages / 2; -} - -/* The following parameters are exported via /proc/sys/vm */ - -/* - * Start background writeback (via pdflush) at this percentage - */ -int dirty_background_ratio = 5; - -/* - * dirty_background_bytes starts at 0 (disabled) so that it is a function of - * dirty_background_ratio * the amount of dirtyable memory - */ -unsigned long dirty_background_bytes; - -/* - * free highmem will not be subtracted from the total free memory - * for calculating free ratios if vm_highmem_is_dirtyable is true - */ -int vm_highmem_is_dirtyable; - -/* - * The generator of dirty data starts writeback at this percentage - */ -int vm_dirty_ratio = 10; - -/* - * vm_dirty_bytes starts at 0 (disabled) so that it is a function of - * vm_dirty_ratio * the amount of dirtyable memory - */ -unsigned long vm_dirty_bytes; - -/* - * The interval between `kupdate'-style writebacks, in jiffies - */ -#ifndef DDE_LINUX -int dirty_writeback_interval = 5 * HZ; -#else -int dirty_writeback_interval = 1250; -#endif - -#ifndef DDE_LINUX -/* - * The longest number of jiffies for which data is allowed to remain dirty - */ -int dirty_expire_interval = 30 * HZ; -#else -int dirty_expire_interval = 7500; -#endif - -/* - * Flag that makes the machine dump writes/reads and block dirtyings. - */ -int block_dump; - -/* - * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: - * a full sync is triggered after this time elapses without any disk activity. - */ -int laptop_mode; - -EXPORT_SYMBOL(laptop_mode); - -/* End of sysctl-exported parameters */ - - -static void background_writeout(unsigned long _min_pages); - -/* - * Scale the writeback cache size proportional to the relative writeout speeds. - * - * We do this by keeping a floating proportion between BDIs, based on page - * writeback completions [end_page_writeback()]. Those devices that write out - * pages fastest will get the larger share, while the slower will get a smaller - * share. - * - * We use page writeout completions because we are interested in getting rid of - * dirty pages. Having them written out is the primary goal. - * - * We introduce a concept of time, a period over which we measure these events, - * because demand can/will vary over time. The length of this period itself is - * measured in page writeback completions. - * - */ -static struct prop_descriptor vm_completions; -static struct prop_descriptor vm_dirties; - -/* - * couple the period to the dirty_ratio: - * - * period/2 ~ roundup_pow_of_two(dirty limit) - */ -static int calc_period_shift(void) -{ - unsigned long dirty_total; - - if (vm_dirty_bytes) - dirty_total = vm_dirty_bytes / PAGE_SIZE; - else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / - 100; - return 2 + ilog2(dirty_total - 1); -} - -/* - * update the period when the dirty threshold changes. 
- */ -static void update_completion_period(void) -{ - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); -} - -int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); - if (ret == 0 && write) - dirty_background_bytes = 0; - return ret; -} - -int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); - if (ret == 0 && write) - dirty_background_ratio = 0; - return ret; -} - -int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old_ratio = vm_dirty_ratio; - int ret; - - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); - if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - update_completion_period(); - vm_dirty_bytes = 0; - } - return ret; -} - - -int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - unsigned long old_bytes = vm_dirty_bytes; - int ret; - - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); - if (ret == 0 && write && vm_dirty_bytes != old_bytes) { - update_completion_period(); - vm_dirty_ratio = 0; - } - return ret; -} - -/* - * Increment the BDI's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). - */ -static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) -{ - __prop_inc_percpu_max(&vm_completions, &bdi->completions, - bdi->max_prop_frac); -} - -void bdi_writeout_inc(struct backing_dev_info *bdi) -{ - unsigned long flags; - - local_irq_save(flags); - __bdi_writeout_inc(bdi); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(bdi_writeout_inc); - -void task_dirty_inc(struct task_struct *tsk) -{ - prop_inc_single(&vm_dirties, &tsk->dirties); -} - -/* - * Obtain an accurate fraction of the BDI's portion. - */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, - long *numerator, long *denominator) -{ - if (bdi_cap_writeback_dirty(bdi)) { - prop_fraction_percpu(&vm_completions, &bdi->completions, - numerator, denominator); - } else { - *numerator = 0; - *denominator = 1; - } -} - -/* - * Clip the earned share of dirty pages to that which is actually available. - * This avoids exceeding the total dirty_limit when the floating averages - * fluctuate too quickly. 
- */ -static void -clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) -{ - long avail_dirty; - - avail_dirty = dirty - - (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_WRITEBACK) + - global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP)); - - if (avail_dirty < 0) - avail_dirty = 0; - - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + - bdi_stat(bdi, BDI_WRITEBACK); - - *pbdi_dirty = min(*pbdi_dirty, avail_dirty); -} - -static inline void task_dirties_fraction(struct task_struct *tsk, - long *numerator, long *denominator) -{ - prop_fraction_single(&vm_dirties, &tsk->dirties, - numerator, denominator); -} - -/* - * scale the dirty limit - * - * task specific dirty limit: - * - * dirty -= (dirty/8) * p_{t} - */ -static void task_dirty_limit(struct task_struct *tsk, long *pdirty) -{ - long numerator, denominator; - long dirty = *pdirty; - u64 inv = dirty >> 3; - - task_dirties_fraction(tsk, &numerator, &denominator); - inv *= numerator; - do_div(inv, denominator); - - dirty -= inv; - if (dirty < *pdirty/2) - dirty = *pdirty/2; - - *pdirty = dirty; -} - -/* - * - */ -static DEFINE_SPINLOCK(bdi_lock); -static unsigned int bdi_min_ratio; - -int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) -{ - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&bdi_lock, flags); - if (min_ratio > bdi->max_ratio) { - ret = -EINVAL; - } else { - min_ratio -= bdi->min_ratio; - if (bdi_min_ratio + min_ratio < 100) { - bdi_min_ratio += min_ratio; - bdi->min_ratio += min_ratio; - } else { - ret = -EINVAL; - } - } - spin_unlock_irqrestore(&bdi_lock, flags); - - return ret; -} - -int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) -{ - unsigned long flags; - int ret = 0; - - if (max_ratio > 100) - return -EINVAL; - - spin_lock_irqsave(&bdi_lock, flags); - if (bdi->min_ratio > max_ratio) { - ret = -EINVAL; - } else { - bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; - } - spin_unlock_irqrestore(&bdi_lock, flags); - - return ret; -} -EXPORT_SYMBOL(bdi_set_max_ratio); - -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -static unsigned long highmem_dirtyable_memory(unsigned long total) -{ -#ifdef CONFIG_HIGHMEM - int node; - unsigned long x = 0; - - for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - - x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); - } - /* - * Make sure that the number of highmem pages is never larger - * than the number of the total dirtyable memory. This can only - * occur in very strange VM situations but we want to make sure - * that this does not occur. - */ - return min(x, total); -#else - return 0; -#endif -} - -/** - * determine_dirtyable_memory - amount of memory that may be used - * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. 
- */ -unsigned long determine_dirtyable_memory(void) -{ - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); - - if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ -} - -void -get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, - unsigned long *pbdi_dirty, struct backing_dev_info *bdi) -{ - unsigned long background; - unsigned long dirty; - unsigned long available_memory = determine_dirtyable_memory(); - struct task_struct *tsk; - - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } - - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); - else - background = (dirty_background_ratio * available_memory) / 100; - - if (background >= dirty) - background = dirty / 2; - tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } - *pbackground = background; - *pdirty = dirty; - - if (bdi) { - u64 bdi_dirty; - long numerator, denominator; - - /* - * Calculate this BDI's share of the dirty ratio. - */ - bdi_writeout_fraction(bdi, &numerator, &denominator); - - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; - - *pbdi_dirty = bdi_dirty; - clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); - task_dirty_limit(current, pbdi_dirty); - } -} - -/* - * balance_dirty_pages() must be called by processes which are generating dirty - * data. It looks at the number of dirty pages in the machine and will force - * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. - */ -static void balance_dirty_pages(struct address_space *mapping) -{ - long nr_reclaimable, bdi_nr_reclaimable; - long nr_writeback, bdi_nr_writeback; - unsigned long background_thresh; - unsigned long dirty_thresh; - unsigned long bdi_thresh; - unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); - - struct backing_dev_info *bdi = mapping->backing_dev_info; - - for (;;) { - struct writeback_control wbc = { - .bdi = bdi, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - nr_writeback = global_page_state(NR_WRITEBACK); - - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); - - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) - break; - - /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. - */ - if (nr_reclaimable + nr_writeback < - (background_thresh + dirty_thresh) / 2) - break; - - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; - - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. 
NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - if (bdi_nr_reclaimable) { - writeback_inodes(&wbc); - pages_written += write_chunk - wbc.nr_to_write; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - } - - /* - * In order to avoid the stacked BDI deadlock we need - * to ensure we accurately count the 'dirty' pages when - * the threshold is low. - * - * Otherwise it would be possible to get thresh+n pages - * reported dirty, even though there are thresh-m pages - * actually dirty; with m+n sitting in the percpu - * deltas. - */ - if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); - } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); - } - - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) - break; - if (pages_written >= write_chunk) - break; /* We've done our duty */ - - congestion_wait(WRITE, HZ/10); - } - - if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && - bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; - - if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ - - /* - * In laptop mode, we wait until hitting the higher threshold before - * starting background writeout, and then write out all the way down - * to the lower threshold. So slow writers cause minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if ((laptop_mode && pages_written) || - (!laptop_mode && (global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS) - > background_thresh))) - pdflush_operation(background_writeout, 0); -} - -void set_page_dirty_balance(struct page *page, int page_mkwrite) -{ - if (set_page_dirty(page) || page_mkwrite) { - struct address_space *mapping = page_mapping(page); - - if (mapping) - balance_dirty_pages_ratelimited(mapping); - } -} - -/** - * balance_dirty_pages_ratelimited_nr - balance dirty memory state - * @mapping: address_space which was dirtied - * @nr_pages_dirtied: number of pages which the caller has just dirtied - * - * Processes which are dirtying memory should call in here once for each page - * which was newly dirtied. The function will periodically check the system's - * dirty state and will initiate writeback if needed. - * - * On really big machines, get_writeback_state is expensive, so try to avoid - * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. - */ -void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, - unsigned long nr_pages_dirtied) -{ - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; - unsigned long ratelimit; - unsigned long *p; - - ratelimit = ratelimit_pages; - if (mapping->backing_dev_info->dirty_exceeded) - ratelimit = 8; - - /* - * Check the rate limiting. Also, we do not want to throttle real-time - * tasks in balance_dirty_pages(). Period. 
- */ - preempt_disable(); - p = &__get_cpu_var(ratelimits); - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit)) { - *p = 0; - preempt_enable(); - balance_dirty_pages(mapping); - return; - } - preempt_enable(); -} -EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); - -void throttle_vm_writeout(gfp_t gfp_mask) -{ - unsigned long background_thresh; - unsigned long dirty_thresh; - - for ( ; ; ) { - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - - /* - * Boost the allowable dirty threshold a bit for page - * allocators so they don't get DoS'ed by heavy writers - */ - dirty_thresh += dirty_thresh / 10; /* wheeee... */ - - if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) <= dirty_thresh) - break; - congestion_wait(WRITE, HZ/10); - - /* - * The caller might hold locks which can prevent IO completion - * or progress in the filesystem. So we cannot just sit here - * waiting for IO to complete. - */ - if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) - break; - } -} - -/* - * writeback at least _min_pages, and keep writing until the amount of dirty - * memory is less than the background threshold, or until we're all clean. - */ -static void background_writeout(unsigned long _min_pages) -{ - long min_pages = _min_pages; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 0, - .nonblocking = 1, - .range_cyclic = 1, - }; - - for ( ; ; ) { - unsigned long background_thresh; - unsigned long dirty_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) < background_thresh - && min_pages <= 0) - break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; - } - } -} - -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns - * -1 if all pdflush threads were busy. - */ -int wakeup_pdflush(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - return pdflush_operation(background_writeout, nr_pages); -} - -#ifndef DDE_LINUX -static void wb_timer_fn(unsigned long unused); -static void laptop_timer_fn(unsigned long unused); - -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); - -/* - * Periodic writeback of "old" data. - * - * Define "old": the first time one of an inode's pages is dirtied, we mark the - * dirtying-time in the inode's address_space. So this periodic writeback code - * just walks the superblock inode list, writing back any inodes which are - * older than a specific point in time. - * - * Try to run once per dirty_writeback_interval. But if a writeback event - * takes longer than a dirty_writeback_interval interval, then leave a - * one-second gap. - * - * older_than_this takes precedence over nr_to_write. So we'll only write back - * all dirty pages if they are all attached to "old" mappings. 
- */ -static void wb_kupdate(unsigned long arg) -{ - unsigned long oldest_jif; - unsigned long start_jif; - unsigned long next_jif; - long nr_to_write; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = &oldest_jif, - .nr_to_write = 0, - .nonblocking = 1, - .for_kupdate = 1, - .range_cyclic = 1, - }; - - sync_supers(); - - oldest_jif = jiffies - dirty_expire_interval; - start_jif = jiffies; - next_jif = start_jif + dirty_writeback_interval; - nr_to_write = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; /* All the old data is written */ - } - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - } - if (time_before(next_jif, jiffies + HZ)) - next_jif = jiffies + HZ; - if (dirty_writeback_interval) - mod_timer(&wb_timer, next_jif); -} - -/* - * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs - */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -{ - proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); - else - del_timer(&wb_timer); - return 0; -} - -static void wb_timer_fn(unsigned long unused) -{ - if (pdflush_operation(wb_kupdate, 0) < 0) - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ -} - -static void laptop_flush(unsigned long unused) -{ - sys_sync(); -} - -static void laptop_timer_fn(unsigned long unused) -{ - pdflush_operation(laptop_flush, 0); -} - -/* - * We've spun up the disk and we're in laptop mode: schedule writeback - * of all dirty data a few seconds from now. If the flush is already scheduled - * then push it back - the user is still using the disk. - */ -void laptop_io_completion(void) -{ - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); -} - -/* - * We're in laptop mode and we've just synced. The sync's writes will have - * caused another writeback to be scheduled by laptop_io_completion. - * Nothing needs to be written back anymore, so we unschedule the writeback. - */ -void laptop_sync_completion(void) -{ - del_timer(&laptop_mode_wb_timer); -} -#endif - -/* - * If ratelimit_pages is too high then we can get into dirty-data overload - * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. - * - * Here we set ratelimit_pages to a level which ensures that when all CPUs are - * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory - * thresholds before writeback cuts in. - * - * But the limit should not be set too high. Because it also controls the - * amount of memory which the balance_dirty_pages() caller has to write back. - * If this is too large then the caller will block on the IO queue all the - * time. So limit it to four megabytes - the balance_dirty_pages() caller - * will write six megabyte chunks, max. 
- */ - -void writeback_set_ratelimit(void) -{ - ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); - if (ratelimit_pages < 16) - ratelimit_pages = 16; - if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) - ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; -} - -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) -{ - writeback_set_ratelimit(); - return NOTIFY_DONE; -} - -static struct notifier_block __cpuinitdata ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - -/* - * Called early on to tune the page writeback dirty limits. - * - * We used to scale dirty pages according to how total memory - * related to pages that could be allocated for buffers (by - * comparing nr_free_buffer_pages() to vm_total_pages. - * - * However, that was when we used "dirty_ratio" to scale with - * all memory, and we don't do that any more. "dirty_ratio" - * is now applied to total non-HIGHPAGE memory (by subtracting - * totalhigh_pages from vm_total_pages), and as such we can't - * get into the old insane situation any more where we had - * large amounts of dirty pages compared to a small amount of - * non-HIGHMEM memory. - * - * But we might still want to scale the dirty_ratio by how - * much memory the box has.. - */ -void __init page_writeback_init(void) -{ - int shift; - -#ifndef DDE_LINUX - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); -#endif - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); - - shift = calc_period_shift(); - prop_descriptor_init(&vm_completions, shift); - prop_descriptor_init(&vm_dirties, shift); -} - -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. 
- */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct backing_dev_info *bdi = mapping->backing_dev_info; - int ret = 0; - int done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t uninitialized_var(writeback_index); - pgoff_t index; - pgoff_t end; /* Inclusive */ - pgoff_t done_index; - int cycled; - int range_whole = 0; - long nr_to_write = wbc->nr_to_write; - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ - } -retry: - done_index = index; - while (!done && (index <= end)) { - int i; - - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } - - done_index = page->index + 1; - - lock_page(page); - - /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data interity operation - * even if there is now a new, dirty page at the same - * pagecache address. - */ - if (unlikely(page->mapping != mapping)) { -continue_unlock: - unlock_page(page); - continue; - } - - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } - - if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; - } - - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; - - ret = (*writepage)(page, wbc, data); - if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). - */ - done = 1; - break; - } - } - - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. 
- */ - done = 1; - break; - } - } - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - break; - } - } - pagevec_release(&pvec); - cond_resched(); - } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } - if (!wbc->no_nrwrite_index_update) { - if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) - mapping->writeback_index = done_index; - wbc->nr_to_write = nr_to_write; - } - - return ret; -} -EXPORT_SYMBOL(write_cache_pages); - -#ifndef DDE_LINUX -/* - * Function used by generic_writepages to call the real writepage - * function and set the mapping flags on error - */ -static int __writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; -} - -/** - * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * - * This is a library function, which implements the writepages() - * address_space_operation. - */ -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; - - return write_cache_pages(mapping, wbc, __writepage, mapping); -} - -EXPORT_SYMBOL(generic_writepages); - -int do_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - int ret; - - if (wbc->nr_to_write <= 0) - return 0; - wbc->for_writepages = 1; - if (mapping->a_ops->writepages) - ret = mapping->a_ops->writepages(mapping, wbc); - else - ret = generic_writepages(mapping, wbc); - wbc->for_writepages = 0; - return ret; -} - -/** - * write_one_page - write out a single page and optionally wait on I/O - * @page: the page to write - * @wait: if true, wait on writeout - * - * The page must be locked by the caller and will be unlocked upon return. - * - * write_one_page() returns a negative error code if I/O failed. - */ -int write_one_page(struct page *page, int wait) -{ - struct address_space *mapping = page->mapping; - int ret = 0; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - }; - - BUG_ON(!PageLocked(page)); - - if (wait) - wait_on_page_writeback(page); - - if (clear_page_dirty_for_io(page)) { - page_cache_get(page); - ret = mapping->a_ops->writepage(page, &wbc); - if (ret == 0 && wait) { - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } - page_cache_release(page); - } else { - unlock_page(page); - } - return ret; -} -EXPORT_SYMBOL(write_one_page); - -/* - * For address_spaces which do not use buffers nor write back. - */ -int __set_page_dirty_no_writeback(struct page *page) -{ - if (!PageDirty(page)) - SetPageDirty(page); - return 0; -} - -/* - * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. - * - * This is also used when a single buffer is being dirtied: we want to set the - * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. - * - * Most callers have locked the page, which pins the address_space in memory. 
- * But zap_pte_range() does not lock the page, however in that case the - * mapping is pinned by the vma's ->vm_file reference. - * - * We take care to handle the case where the page was truncated from the - * mapping by re-checking page_mapping() inside tree_lock. - */ -int __set_page_dirty_nobuffers(struct page *page) -{ - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); - struct address_space *mapping2; - - if (!mapping) - return 1; - - spin_lock_irq(&mapping->tree_lock); - mapping2 = page_mapping(page); - if (mapping2) { /* Race with truncate? */ - BUG_ON(mapping2 != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_dirty_inc(current); - task_io_account_write(PAGE_CACHE_SIZE); - } - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irq(&mapping->tree_lock); - if (mapping->host) { - /* !PageAnon && !swapper_space */ - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - } - return 1; - } - return 0; -} -EXPORT_SYMBOL(__set_page_dirty_nobuffers); - -/* - * When a writepage implementation decides that it doesn't want to write this - * page for some reason, it should redirty the locked page via - * redirty_page_for_writepage() and it should then unlock the page and return 0 - */ -int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) -{ - wbc->pages_skipped++; - return __set_page_dirty_nobuffers(page); -} -EXPORT_SYMBOL(redirty_page_for_writepage); - -/* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - */ -int set_page_dirty(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (likely(mapping)) { - int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; -#ifdef CONFIG_BLOCK - if (!spd) - spd = __set_page_dirty_buffers; -#endif - return (*spd)(page); - } - if (!PageDirty(page)) { - if (!TestSetPageDirty(page)) - return 1; - } - return 0; -} -EXPORT_SYMBOL(set_page_dirty); - -/* - * set_page_dirty() is racy if the caller has no reference against - * page->mapping->host, and if the page is unlocked. This is because another - * CPU could truncate the page off the mapping and then free the mapping. - * - * Usually, the page _is_ locked, or the caller is a user-space process which - * holds a reference on the inode by having an open file. - * - * In other cases, the page should be locked before running set_page_dirty(). - */ -int set_page_dirty_lock(struct page *page) -{ - int ret; - - lock_page_nosync(page); - ret = set_page_dirty(page); - unlock_page(page); - return ret; -} -EXPORT_SYMBOL(set_page_dirty_lock); -#endif - -/* - * Clear a page's dirty flag, while caring for dirty memory accounting. - * Returns true if the page was previously dirty. - * - * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync - * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage - * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag - * back into sync. - * - * This incoherency between the page's dirty flag and radix-tree tag is - * unfortunate, but it only exists while the page is locked. 
- */ -int clear_page_dirty_for_io(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - BUG_ON(!PageLocked(page)); - - ClearPageReclaim(page); - if (mapping && mapping_cap_account_dirty(mapping)) { - /* - * Yes, Virginia, this is indeed insane. - * - * We use this sequence to make sure that - * (a) we account for dirty stats properly - * (b) we tell the low-level filesystem to - * mark the whole page dirty if it was - * dirty in a pagetable. Only to then - * (c) clean the page again and return 1 to - * cause the writeback. - * - * This way we avoid all nasty races with the - * dirty bit in multiple places and clearing - * them concurrently from different threads. - * - * Note! Normally the "set_page_dirty(page)" - * has no effect on the actual dirty bit - since - * that will already usually be set. But we - * need the side effects, and it can help us - * avoid races. - * - * We basically use the page "master dirty bit" - * as a serialization point for all the different - * threads doing their things. - */ - if (page_mkclean(page)) - set_page_dirty(page); - /* - * We carefully synchronise fault handlers against - * installing a dirty pte and marking the page dirty - * at this point. We do this by having them hold the - * page lock at some point after installing their - * pte, but before marking the page dirty. - * Pages are always locked coming in here, so we get - * the desired exclusion. See mm/memory.c:do_wp_page() - * for more comments. - */ - if (TestClearPageDirty(page)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - return 1; - } - return 0; - } - return TestClearPageDirty(page); -} -EXPORT_SYMBOL(clear_page_dirty_for_io); - -int test_clear_page_writeback(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - int ret; - - if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - ret = TestClearPageWriteback(page); - if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) { - __dec_bdi_stat(bdi, BDI_WRITEBACK); - __bdi_writeout_inc(bdi); - } - } - spin_unlock_irqrestore(&mapping->tree_lock, flags); - } else { - ret = TestClearPageWriteback(page); - } - if (ret) - dec_zone_page_state(page, NR_WRITEBACK); - return ret; -} - -#ifndef DDE_LINUX -int test_set_page_writeback(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - int ret; - - if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - ret = TestSetPageWriteback(page); - if (!ret) { - radix_tree_tag_set(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) - __inc_bdi_stat(bdi, BDI_WRITEBACK); - } - if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - } else { - ret = TestSetPageWriteback(page); - } - if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); - return ret; - -} -EXPORT_SYMBOL(test_set_page_writeback); -#endif /* DDE_LINUX */ - -/* - * Return true if any of the pages in the mapping are marked with the - * passed tag. 
- */ -int mapping_tagged(struct address_space *mapping, int tag) -{ - int ret; - rcu_read_lock(); - ret = radix_tree_tagged(&mapping->page_tree, tag); - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL(mapping_tagged); diff --git a/libdde_linux26/lib/src/net/.svn/all-wcprops b/libdde_linux26/lib/src/net/.svn/all-wcprops deleted file mode 100644 index 194a3d0f..00000000 --- a/libdde_linux26/lib/src/net/.svn/all-wcprops +++ /dev/null @@ -1,5 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 62 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net -END diff --git a/libdde_linux26/lib/src/net/.svn/entries b/libdde_linux26/lib/src/net/.svn/entries deleted file mode 100644 index 7f8b56ad..00000000 --- a/libdde_linux26/lib/src/net/.svn/entries +++ /dev/null @@ -1,37 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/net -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -netlink -dir - -sched -dir - -core -dir - diff --git a/libdde_linux26/lib/src/net/.svn/format b/libdde_linux26/lib/src/net/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/net/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/net/core/.svn/all-wcprops b/libdde_linux26/lib/src/net/core/.svn/all-wcprops deleted file mode 100644 index 5256bdd2..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/all-wcprops +++ /dev/null @@ -1,41 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 67 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/core -END -dev.c -K 25 -svn:wc:ra_dav:version-url -V 73 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/core/dev.c -END -skbuff.c -K 25 -svn:wc:ra_dav:version-url -V 76 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/core/skbuff.c -END -utils.c -K 25 -svn:wc:ra_dav:version-url -V 75 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/core/utils.c -END -net_namespace.c -K 25 -svn:wc:ra_dav:version-url -V 83 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/core/net_namespace.c -END -link_watch.c -K 25 -svn:wc:ra_dav:version-url -V 80 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/core/link_watch.c -END -rtnetlink.c -K 25 -svn:wc:ra_dav:version-url -V 79 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/core/rtnetlink.c -END diff --git a/libdde_linux26/lib/src/net/core/.svn/entries b/libdde_linux26/lib/src/net/core/.svn/entries deleted file mode 100644 index 239fe3c4..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/entries +++ /dev/null @@ -1,232 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/net/core -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -dev.c -file - - - - -2009-11-15T17:17:09.000000Z -b222712999c467e977b322909437aee7 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -130218 - -skbuff.c -file - - - - -2009-11-15T17:17:09.000000Z -47bfe40a5f38451bbe49994d591986f0 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -73586 - -utils.c -file - - - - -2009-11-15T17:17:09.000000Z -b34eb3c7def5a58f3e0d50edb224e95e -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -6555 
- -net_namespace.c -file - - - - -2009-11-15T17:17:09.000000Z -b2faf73d97de3c79bef0a83e5884a648 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -11707 - -link_watch.c -file - - - - -2009-11-15T17:17:09.000000Z -11b021f9fcf4f267cd2df04168faab23 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -5091 - -rtnetlink.c -file - - - - -2009-11-15T17:17:09.000000Z -b7c7236216ef1fd286f7bc9587d60f6d -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -34400 - diff --git a/libdde_linux26/lib/src/net/core/.svn/format b/libdde_linux26/lib/src/net/core/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/net/core/.svn/text-base/dev.c.svn-base b/libdde_linux26/lib/src/net/core/.svn/text-base/dev.c.svn-base deleted file mode 100644 index 22fdf4d7..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/text-base/dev.c.svn-base +++ /dev/null @@ -1,5302 +0,0 @@ -/* - * NET3 Protocol independent device support routines. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Mark Evans, <evansmp@uhura.aston.ac.uk> - * - * Additional Authors: - * Florian la Roche <rzsfl@rz.uni-sb.de> - * Alan Cox <gw4pts@gw4pts.ampr.org> - * David Hinds <dahinds@users.sourceforge.net> - * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> - * Adam Sulmicki <adam@cfar.umd.edu> - * Pekka Riikonen <priikone@poesidon.pspt.fi> - * - * Changes: - * D.J. Barrow : Fixed bug where dev->refcnt gets set - * to 2 if register_netdev gets called - * before net_dev_init & also removed a - * few lines of code in the process. - * Alan Cox : device private ioctl copies fields back. - * Alan Cox : Transmit queue code does relevant - * stunts to keep the queue safe. - * Alan Cox : Fixed double lock. - * Alan Cox : Fixed promisc NULL pointer trap - * ???????? : Support the full private ioctl range - * Alan Cox : Moved ioctl permission check into - * drivers - * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI - * Alan Cox : 100 backlog just doesn't cut it when - * you start doing multicast video 8) - * Alan Cox : Rewrote net_bh and list manager. - * Alan Cox : Fix ETH_P_ALL echoback lengths. - * Alan Cox : Took out transmit every packet pass - * Saved a few bytes in the ioctl handler - * Alan Cox : Network driver sets packet type before - * calling netif_rx. Saves a function - * call a packet. - * Alan Cox : Hashed net_bh() - * Richard Kooijman: Timestamp fixes. - * Alan Cox : Wrong field in SIOCGIFDSTADDR - * Alan Cox : Device lock protection. - * Alan Cox : Fixed nasty side effect of device close - * changes. - * Rudi Cilibrasi : Pass the right thing to - * set_mac_address() - * Dave Miller : 32bit quantity for the device lock to - * make it work out on a Sparc. - * Bjorn Ekwall : Added KERNELD hack. - * Alan Cox : Cleaned up the backlog initialise. - * Craig Metz : SIOCGIFCONF fix if space for under - * 1 device. - * Thomas Bogendoerfer : Return ENODEV for dev_open, if there - * is no device open function. 
- * Andi Kleen : Fix error reporting for SIOCGIFCONF - * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF - * Cyrus Durgin : Cleaned for KMOD - * Adam Sulmicki : Bug Fix : Network Device Unload - * A network device unload needs to purge - * the backlog queue. - * Paul Rusty Russell : SIOCSIFNAME - * Pekka Riikonen : Netdev boot-time settings code - * Andrew Morton : Make unregister_netdevice wait - * indefinitely on dev->refcnt - * J Hadi Salim : - Backlog queue sampling - * - netif_rx() feedback - */ - -#ifdef DDE_LINUX -#include "local.h" -#include <l4/dde/linux26/dde26_net.h> -#endif - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/capability.h> -#include <linux/cpu.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mutex.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/ethtool.h> -#include <linux/notifier.h> -#include <linux/skbuff.h> -#include <net/net_namespace.h> -#include <net/sock.h> -#include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/stat.h> -#include <linux/if_bridge.h> -#include <linux/if_macvlan.h> -#include <net/dst.h> -#include <net/pkt_sched.h> -#include <net/checksum.h> -#include <linux/highmem.h> -#include <linux/init.h> -#include <linux/kmod.h> -#include <linux/module.h> -#include <linux/netpoll.h> -#include <linux/rcupdate.h> -#include <linux/delay.h> -#include <net/wext.h> -#include <net/iw_handler.h> -#include <asm/current.h> -#include <linux/audit.h> -#include <linux/dmaengine.h> -#include <linux/err.h> -#include <linux/ctype.h> -#include <linux/if_arp.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> -#include <net/ip.h> -#include <linux/ipv6.h> -#include <linux/in.h> -#include <linux/jhash.h> -#include <linux/random.h> - -#include "net-sysfs.h" - -/* Instead of increasing this, you should create a hash table. */ -#define MAX_GRO_SKBS 8 - -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - -/* - * The list of packet types we will receive (as opposed to discard) - * and the routines to invoke. - * - * Why 16. Because with 16 the only overlap we get on a hash of the - * low nibble of the protocol value is RARP/SNAP/X.25. - * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversely affected. - * --BLG - * - * 0800 IP - * 8100 802.1Q VLAN - * 0001 802.3 - * 0002 AX.25 - * 0004 802.2 - * 8035 RARP - * 0005 SNAP - * 0805 X.25 - * 0806 ARP - * 8137 IPX - * 0009 Localtalk - * 86DD IPv6 - */ - -#define PTYPE_HASH_SIZE (16) -#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) - -static DEFINE_SPINLOCK(ptype_lock); -static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -static struct list_head ptype_all __read_mostly; /* Taps */ - -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading. 
- * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); - -EXPORT_SYMBOL(dev_base_lock); - -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - -static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) -{ - unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; -} - -static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) -{ - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; -} - -/* Device list insertion */ -static int list_netdevice(struct net_device *dev) -{ - struct net *net = dev_net(dev); - - ASSERT_RTNL(); - - write_lock_bh(&dev_base_lock); - list_add_tail(&dev->dev_list, &net->dev_base_head); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); - hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); - return 0; -} - -/* Device list removal */ -static void unlist_netdevice(struct net_device *dev) -{ - ASSERT_RTNL(); - - /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); - list_del(&dev->dev_list); - hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); -} - -/* - * Our notifier list - */ - -static RAW_NOTIFIER_HEAD(netdev_chain); - -/* - * Device drivers call our routines to queue packets here. We empty the - * queue in the local softnet handler. 
- */ - -DEFINE_PER_CPU(struct softnet_data, softnet_data); - -#ifdef CONFIG_LOCKDEP -/* - * register_netdevice() inits txq->_xmit_lock and sets lockdep class - * according to dev->type - */ -static const unsigned short netdev_lock_type[] = - {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, - ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, - ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, - ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, - ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, - ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, - ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, - ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, - ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, - ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, - ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, - ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, - ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, - ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, - ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE}; - -static const char *netdev_lock_name[] = - {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", - "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", - "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", - "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", - "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", - "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", - "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", - "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", - "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", - "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", - "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", - "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", - "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", - "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"}; - -static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; -static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; - -static inline unsigned short netdev_lock_pos(unsigned short dev_type) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) - if (netdev_lock_type[i] == dev_type) - return i; - /* the last key is used by default */ - return ARRAY_SIZE(netdev_lock_type) - 1; -} - -static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, - unsigned short dev_type) -{ - int i; - - i = netdev_lock_pos(dev_type); - lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], - netdev_lock_name[i]); -} - -static inline void netdev_set_addr_lockdep_class(struct net_device *dev) -{ - int i; - - i = netdev_lock_pos(dev->type); - lockdep_set_class_and_name(&dev->addr_list_lock, - &netdev_addr_lock_key[i], - netdev_lock_name[i]); -} -#else -static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, - unsigned short dev_type) -{ -} -static inline void netdev_set_addr_lockdep_class(struct net_device *dev) -{ -} -#endif - -/******************************************************************************* - - Protocol management and registration routines - -*******************************************************************************/ - -/* - * Add a protocol ID to the list. 
Now that the input handler is - * smarter we can dispense with all the messy stuff that used to be - * here. - * - * BEWARE!!! Protocol handlers, mangling input packets, - * MUST BE last in hash buckets and checking protocol handlers - * MUST start from promiscuous ptype_all chain in net_bh. - * It is true now, do not change it. - * Explanation follows: if protocol handler, mangling packet, will - * be the first on list, it is not able to sense, that packet - * is cloned and should be copied-on-write, so that it will - * change it and subsequent readers will get broken packet. - * --ANK (980803) - */ - -/** - * dev_add_pack - add packet handler - * @pt: packet type declaration - * - * Add a protocol handler to the networking stack. The passed &packet_type - * is linked into kernel lists and may not be freed until it has been - * removed from the kernel lists. - * - * This call does not sleep therefore it can not - * guarantee all CPU's that are in middle of receiving packets - * will see the new packet type (until the next received packet). - */ - -void dev_add_pack(struct packet_type *pt) -{ - int hash; - - spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) - list_add_rcu(&pt->list, &ptype_all); - else { - hash = ntohs(pt->type) & PTYPE_HASH_MASK; - list_add_rcu(&pt->list, &ptype_base[hash]); - } - spin_unlock_bh(&ptype_lock); -} - -/** - * __dev_remove_pack - remove packet handler - * @pt: packet type declaration - * - * Remove a protocol handler that was previously added to the kernel - * protocol handlers by dev_add_pack(). The passed &packet_type is removed - * from the kernel lists and can be freed or reused once this function - * returns. - * - * The packet type might still be in use by receivers - * and must not be freed until after all the CPU's have gone - * through a quiescent state. - */ -void __dev_remove_pack(struct packet_type *pt) -{ - struct list_head *head; - struct packet_type *pt1; - - spin_lock_bh(&ptype_lock); - - if (pt->type == htons(ETH_P_ALL)) - head = &ptype_all; - else - head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; - - list_for_each_entry(pt1, head, list) { - if (pt == pt1) { - list_del_rcu(&pt->list); - goto out; - } - } - - printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); -out: - spin_unlock_bh(&ptype_lock); -} -/** - * dev_remove_pack - remove packet handler - * @pt: packet type declaration - * - * Remove a protocol handler that was previously added to the kernel - * protocol handlers by dev_add_pack(). The passed &packet_type is removed - * from the kernel lists and can be freed or reused once this function - * returns. - * - * This call sleeps to guarantee that no CPU is looking at the packet - * type after return. - */ -void dev_remove_pack(struct packet_type *pt) -{ - __dev_remove_pack(pt); - - synchronize_net(); -} - -/****************************************************************************** - - Device Boot-time Settings Routines - -*******************************************************************************/ - -/* Boot time configuration table */ -static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; - -/** - * netdev_boot_setup_add - add new setup entry - * @name: name of the device - * @map: configured settings for the device - * - * Adds new setup entry to the dev_boot_setup list. The function - * returns 0 on error and 1 on success. This is a generic routine to - * all netdevices. 
- */ -static int netdev_boot_setup_add(char *name, struct ifmap *map) -{ - struct netdev_boot_setup *s; - int i; - - s = dev_boot_setup; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { - memset(s[i].name, 0, sizeof(s[i].name)); - strlcpy(s[i].name, name, IFNAMSIZ); - memcpy(&s[i].map, map, sizeof(s[i].map)); - break; - } - } - - return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; -} - -/** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice - * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. - */ -int netdev_boot_setup_check(struct net_device *dev) -{ - struct netdev_boot_setup *s = dev_boot_setup; - int i; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; - return 1; - } - } - return 0; -} - - -/** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device - * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. - */ -unsigned long netdev_boot_base(const char *prefix, int unit) -{ - const struct netdev_boot_setup *s = dev_boot_setup; - char name[IFNAMSIZ]; - int i; - - sprintf(name, "%s%d", prefix, unit); - - /* - * If device already registered then return base of 1 - * to indicate not to probe for this interface - */ - if (__dev_get_by_name(&init_net, name)) - return 1; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) - if (!strcmp(name, s[i].name)) - return s[i].map.base_addr; - return 0; -} - -/* - * Saves at boot time configured settings for any netdevice. - */ -int __init netdev_boot_setup(char *str) -{ - int ints[5]; - struct ifmap map; - - str = get_options(str, ARRAY_SIZE(ints), ints); - if (!str || !*str) - return 0; - - /* Save settings */ - memset(&map, 0, sizeof(map)); - if (ints[0] > 0) - map.irq = ints[1]; - if (ints[0] > 1) - map.base_addr = ints[2]; - if (ints[0] > 2) - map.mem_start = ints[3]; - if (ints[0] > 3) - map.mem_end = ints[4]; - - /* Add new entry to the list */ - return netdev_boot_setup_add(str, &map); -} - -__setup("netdev=", netdev_boot_setup); - -/******************************************************************************* - - Device Interface Subroutines - -*******************************************************************************/ - -/** - * __dev_get_by_name - find a device by its name - * @net: the applicable net namespace - * @name: name to find - * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The - * reference counters are not incremented so the caller must be - * careful with locks. 
- */ - -struct net_device *__dev_get_by_name(struct net *net, const char *name) -{ - struct hlist_node *p; - - hlist_for_each(p, dev_name_hash(net, name)) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - } - return NULL; -} - -/** - * dev_get_by_name - find a device by its name - * @net: the applicable net namespace - * @name: name to find - * - * Find an interface by name. This can be called from any - * context and does its own locking. The returned handle has - * the usage count incremented and the caller must use dev_put() to - * release it when it is no longer needed. %NULL is returned if no - * matching device is found. - */ - -struct net_device *dev_get_by_name(struct net *net, const char *name) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); - if (dev) - dev_hold(dev); - read_unlock(&dev_base_lock); - return dev; -} - -/** - * __dev_get_by_index - find a device by its ifindex - * @net: the applicable net namespace - * @ifindex: index of device - * - * Search for an interface by index. Returns %NULL if the device - * is not found or a pointer to the device. The device has not - * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. - */ - -struct net_device *__dev_get_by_index(struct net *net, int ifindex) -{ - struct hlist_node *p; - - hlist_for_each(p, dev_index_hash(net, ifindex)) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); - if (dev->ifindex == ifindex) - return dev; - } - return NULL; -} - - -/** - * dev_get_by_index - find a device by its ifindex - * @net: the applicable net namespace - * @ifindex: index of device - * - * Search for an interface by index. Returns NULL if the device - * is not found or a pointer to the device. The device returned has - * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. - */ - -struct net_device *dev_get_by_index(struct net *net, int ifindex) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); - if (dev) - dev_hold(dev); - read_unlock(&dev_base_lock); - return dev; -} - -/** - * dev_getbyhwaddr - find a device by its hardware address - * @net: the applicable net namespace - * @type: media type of device - * @ha: hardware address - * - * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold the - * rtnl semaphore. 
The returned device has not had its ref count increased - * and the caller must therefore be careful about locking - * - * BUGS: - * If the API was consistent this would be __dev_get_by_hwaddr - */ - -struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) -{ - struct net_device *dev; - - ASSERT_RTNL(); - - for_each_netdev(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) - return dev; - - return NULL; -} - -EXPORT_SYMBOL(dev_getbyhwaddr); - -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} - -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - -struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - rtnl_lock(); - dev = __dev_getfirstbyhwtype(net, type); - if (dev) - dev_hold(dev); - rtnl_unlock(); - return dev; -} - -EXPORT_SYMBOL(dev_getfirstbyhwtype); - -/** - * dev_get_by_flags - find any device with given flags - * @net: the applicable net namespace - * @if_flags: IFF_* values - * @mask: bitmask of bits in if_flags to check - * - * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. The device returned has - * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. - */ - -struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) -{ - struct net_device *dev, *ret; - - ret = NULL; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { - if (((dev->flags ^ if_flags) & mask) == 0) { - dev_hold(dev); - ret = dev; - break; - } - } - read_unlock(&dev_base_lock); - return ret; -} - -/** - * dev_valid_name - check if name is okay for network device - * @name: name string - * - * Network device names need to be valid file names to - * to allow sysfs to work. We also disallow any kind of - * whitespace. - */ -int dev_valid_name(const char *name) -{ - if (*name == '\0') - return 0; - if (strlen(name) >= IFNAMSIZ) - return 0; - if (!strcmp(name, ".") || !strcmp(name, "..")) - return 0; - - while (*name) { - if (*name == '/' || isspace(*name)) - return 0; - name++; - } - return 1; -} - -/** - * __dev_alloc_name - allocate a name for a device - * @net: network namespace to allocate the device name in - * @name: name format string - * @buf: scratch buffer and result name string - * - * Passed a format string - eg "lt%d" it will try and find a suitable - * id. It scans list of devices to build up a free map, then chooses - * the first empty slot. The caller must hold the dev_base or rtnl lock - * while allocating the name and adding the device in order to avoid - * duplicates. - * Limited to bits_per_byte * page size devices (ie 32K on most platforms). - * Returns the number of the unit assigned or a negative errno code. - */ - -static int __dev_alloc_name(struct net *net, const char *name, char *buf) -{ - int i = 0; - const char *p; - const int max_netdevices = 8*PAGE_SIZE; - unsigned long *inuse; - struct net_device *d; - - p = strnchr(name, IFNAMSIZ-1, '%'); - if (p) { - /* - * Verify the string as this thing may have come from - * the user. There must be either one "%d" and no other "%" - * characters. 
- */ - if (p[1] != 'd' || strchr(p + 2, '%')) - return -EINVAL; - - /* Use one page as a bit array of possible slots */ - inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); - if (!inuse) - return -ENOMEM; - - for_each_netdev(net, d) { - if (!sscanf(d->name, name, &i)) - continue; - if (i < 0 || i >= max_netdevices) - continue; - - /* avoid cases where sscanf is not exact inverse of printf */ - snprintf(buf, IFNAMSIZ, name, i); - if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); - } - - i = find_first_zero_bit(inuse, max_netdevices); - free_page((unsigned long) inuse); - } - - snprintf(buf, IFNAMSIZ, name, i); - if (!__dev_get_by_name(net, buf)) - return i; - - /* It is possible to run out of possible slots - * when the name is long and there isn't enough space left - * for the digits, or if all bits are used. - */ - return -ENFILE; -} - -/** - * dev_alloc_name - allocate a name for a device - * @dev: device - * @name: name format string - * - * Passed a format string - eg "lt%d" it will try and find a suitable - * id. It scans list of devices to build up a free map, then chooses - * the first empty slot. The caller must hold the dev_base or rtnl lock - * while allocating the name and adding the device in order to avoid - * duplicates. - * Limited to bits_per_byte * page size devices (ie 32K on most platforms). - * Returns the number of the unit assigned or a negative errno code. - */ - -int dev_alloc_name(struct net_device *dev, const char *name) -{ - char buf[IFNAMSIZ]; - struct net *net; - int ret; - - BUG_ON(!dev_net(dev)); - net = dev_net(dev); - ret = __dev_alloc_name(net, name, buf); - if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); - return ret; -} - - -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, const char *newname) -{ - char oldname[IFNAMSIZ]; - int err = 0; - int ret; - struct net *net; - - ASSERT_RTNL(); - BUG_ON(!dev_net(dev)); - - net = dev_net(dev); - if (dev->flags & IFF_UP) - return -EBUSY; - - if (!dev_valid_name(newname)) - return -EINVAL; - - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) - return 0; - - memcpy(oldname, dev->name, IFNAMSIZ); - - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - } - else if (__dev_get_by_name(net, newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); - -rollback: - /* For now only devices in the initial network namespace - * are in sysfs. 
- */ - if (net == &init_net) { - ret = device_rename(&dev->dev, dev->name); - if (ret) { - memcpy(dev->name, oldname, IFNAMSIZ); - return ret; - } - } - - write_lock_bh(&dev_base_lock); - hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); - write_unlock_bh(&dev_base_lock); - - ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); - ret = notifier_to_errno(ret); - - if (ret) { - if (err) { - printk(KERN_ERR - "%s: name change rollback failed: %d.\n", - dev->name, ret); - } else { - err = ret; - memcpy(dev->name, oldname, IFNAMSIZ); - goto rollback; - } - } - - return err; -} - -/** - * dev_set_alias - change ifalias of a device - * @dev: device - * @alias: name up to IFALIASZ - * @len: limit of bytes to copy from info - * - * Set ifalias for a device, - */ -int dev_set_alias(struct net_device *dev, const char *alias, size_t len) -{ - ASSERT_RTNL(); - - if (len >= IFALIASZ) - return -EINVAL; - - if (!len) { - if (dev->ifalias) { - kfree(dev->ifalias); - dev->ifalias = NULL; - } - return 0; - } - - dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL); - if (!dev->ifalias) - return -ENOMEM; - - strlcpy(dev->ifalias, alias, len+1); - return len; -} - - -/** - * netdev_features_change - device changes features - * @dev: device to cause notification - * - * Called to indicate a device has changed features. - */ -void netdev_features_change(struct net_device *dev) -{ - call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); -} -EXPORT_SYMBOL(netdev_features_change); - -/** - * netdev_state_change - device changes state - * @dev: device to cause notification - * - * Called to indicate a device has changed state. This function calls - * the notifier chains for netdev_chain and sends a NEWLINK message - * to the routing socket. - */ -void netdev_state_change(struct net_device *dev) -{ - if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); - } -} - -void netdev_bonding_change(struct net_device *dev) -{ - call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); -} -EXPORT_SYMBOL(netdev_bonding_change); - -/** - * dev_load - load a network module - * @net: the applicable net namespace - * @name: name of interface - * - * If a network interface is not present and the process has suitable - * privileges this function loads the module. If module loading is not - * available in this kernel then it becomes a nop. - */ - -void dev_load(struct net *net, const char *name) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); - read_unlock(&dev_base_lock); - - if (!dev && capable(CAP_SYS_MODULE)) - request_module("%s", name); -} - -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev) -{ - const struct net_device_ops *ops = dev->netdev_ops; - int ret = 0; - - ASSERT_RTNL(); - - /* - * Is it already up? - */ - - if (dev->flags & IFF_UP) - return 0; - - /* - * Is it even present? 
- */ - if (!netif_device_present(dev)) - return -ENODEV; - - /* - * Call device private open method - */ - set_bit(__LINK_STATE_START, &dev->state); - - if (ops->ndo_validate_addr) - ret = ops->ndo_validate_addr(dev); - - if (!ret && ops->ndo_open) - ret = ops->ndo_open(dev); - - /* - * If it went open OK then: - */ - - if (ret) - clear_bit(__LINK_STATE_START, &dev->state); - else { - /* - * Set the flags. - */ - dev->flags |= IFF_UP; - - /* - * Enable NET_DMA - */ - net_dmaengine_get(); - - /* - * Initialize multicasting status - */ - dev_set_rx_mode(dev); - - /* - * Wakeup transmit queue engine - */ - dev_activate(dev); - - /* - * ... and announce new interface. - */ - call_netdevice_notifiers(NETDEV_UP, dev); - } - - return ret; -} - -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. - */ -int dev_close(struct net_device *dev) -{ - const struct net_device_ops *ops = dev->netdev_ops; - ASSERT_RTNL(); - - might_sleep(); - - if (!(dev->flags & IFF_UP)) - return 0; - - /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. - */ - call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); - - clear_bit(__LINK_STATE_START, &dev->state); - - /* Synchronize to scheduled poll. We cannot touch poll list, - * it can be even on different cpu. So just clear netif_running(). - * - * dev->stop() will invoke napi_disable() on all of it's - * napi_struct instances on this device. - */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ - - dev_deactivate(dev); - - /* - * Call the device specific close. This cannot fail. - * Only if device is UP - * - * We allow it to be called even after a DETACH hot-plug - * event. - */ - if (ops->ndo_stop) - ops->ndo_stop(dev); - - /* - * Device is now down. - */ - - dev->flags &= ~IFF_UP; - - /* - * Tell people we are down - */ - call_netdevice_notifiers(NETDEV_DOWN, dev); - - /* - * Shutdown NET_DMA - */ - net_dmaengine_put(); - - return 0; -} - - -/** - * dev_disable_lro - disable Large Receive Offload on a device - * @dev: device - * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. This is needed if received packets may be - * forwarded to another interface. - */ -void dev_disable_lro(struct net_device *dev) -{ - if (dev->ethtool_ops && dev->ethtool_ops->get_flags && - dev->ethtool_ops->set_flags) { - u32 flags = dev->ethtool_ops->get_flags(dev); - if (flags & ETH_FLAG_LRO) { - flags &= ~ETH_FLAG_LRO; - dev->ethtool_ops->set_flags(dev, flags); - } - } - WARN_ON(dev->features & NETIF_F_LRO); -} -EXPORT_SYMBOL(dev_disable_lro); - - -static int dev_boot_phase = 1; - -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. - */ - -/** - * register_netdevice_notifier - register a network notifier block - * @nb: notifier - * - * Register a notifier to be called when network device events occur. - * The notifier passed is linked into the kernel structures and must - * not be reused until it has been unregistered. A negative errno code - * is returned on a failure. - * - * When registered all registration and up events are replayed - * to the new notifier to allow device to have a race free - * view of the network device list. 
- */ - -int register_netdevice_notifier(struct notifier_block *nb) -{ - struct net_device *dev; - struct net_device *last; - struct net *net; - int err; - - rtnl_lock(); - err = raw_notifier_chain_register(&netdev_chain, nb); - if (err) - goto unlock; - if (dev_boot_phase) - goto unlock; - for_each_net(net) { - for_each_netdev(net, dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - nb->notifier_call(nb, NETDEV_UP, dev); - } - } - -unlock: - rtnl_unlock(); - return err; - -rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - break; - - if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); - } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - } - } - - raw_notifier_chain_unregister(&netdev_chain, nb); - goto unlock; -} - -/** - * unregister_netdevice_notifier - unregister a network notifier block - * @nb: notifier - * - * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the - * kernel structures and may then be reused. A negative errno code - * is returned on a failure. - */ - -int unregister_netdevice_notifier(struct notifier_block *nb) -{ - int err; - - rtnl_lock(); - err = raw_notifier_chain_unregister(&netdev_chain, nb); - rtnl_unlock(); - return err; -} - -/** - * call_netdevice_notifiers - call all network notifier blocks - * @val: value passed unmodified to notifier function - * @dev: net_device pointer passed unmodified to notifier function - * - * Call all network notifier blocks. Parameters and return value - * are as for raw_notifier_call_chain(). - */ - -int call_netdevice_notifiers(unsigned long val, struct net_device *dev) -{ - return raw_notifier_call_chain(&netdev_chain, val, dev); -} - -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); - -void net_enable_timestamp(void) -{ - atomic_inc(&netstamp_needed); -} - -void net_disable_timestamp(void) -{ - atomic_dec(&netstamp_needed); -} - -static inline void net_timestamp(struct sk_buff *skb) -{ - if (atomic_read(&netstamp_needed)) - __net_timestamp(skb); - else - skb->tstamp.tv64 = 0; -} - -/* - * Support routine. Sends outgoing frames to any network - * taps currently in use. - */ - -static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) -{ - struct packet_type *ptype; - - net_timestamp(skb); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { - /* Never send packets back to the socket - * they originated from - MvS (miquels@drinkel.ow.org) - */ - if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { - struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; - - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. 
- */ - skb_reset_mac_header(skb2); - - if (skb_network_header(skb2) < skb2->data || - skb2->network_header > skb2->tail) { - if (net_ratelimit()) - printk(KERN_CRIT "protocol %04x is " - "buggy, dev %s\n", - skb2->protocol, dev->name); - skb_reset_network_header(skb2); - } - - skb2->transport_header = skb2->network_header; - skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype, skb->dev); - } - } - rcu_read_unlock(); -} - - -static inline void __netif_reschedule(struct Qdisc *q) -{ - struct softnet_data *sd; - unsigned long flags; - - local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); - q->next_sched = sd->output_queue; - sd->output_queue = q; - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); -} - -void __netif_schedule(struct Qdisc *q) -{ - if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) - __netif_reschedule(q); -} -EXPORT_SYMBOL(__netif_schedule); - -void dev_kfree_skb_irq(struct sk_buff *skb) -{ - if (atomic_dec_and_test(&skb->users)) { - struct softnet_data *sd; - unsigned long flags; - - local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); - skb->next = sd->completion_queue; - sd->completion_queue = skb; - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); - } -} -EXPORT_SYMBOL(dev_kfree_skb_irq); - -void dev_kfree_skb_any(struct sk_buff *skb) -{ - if (in_irq() || irqs_disabled()) - dev_kfree_skb_irq(skb); - else - dev_kfree_skb(skb); -} -EXPORT_SYMBOL(dev_kfree_skb_any); - - -/** - * netif_device_detach - mark device as removed - * @dev: network device - * - * Mark device as removed from system and therefore no longer available. - */ -void netif_device_detach(struct net_device *dev) -{ - if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && - netif_running(dev)) { - netif_stop_queue(dev); - } -} -EXPORT_SYMBOL(netif_device_detach); - -/** - * netif_device_attach - mark device as attached - * @dev: network device - * - * Mark device as attached from system and restart if needed. - */ -void netif_device_attach(struct net_device *dev) -{ - if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && - netif_running(dev)) { - netif_wake_queue(dev); - __netdev_watchdog_up(dev); - } -} -EXPORT_SYMBOL(netif_device_attach); - -static bool can_checksum_protocol(unsigned long features, __be16 protocol) -{ - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_IP_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_IPV6_CSUM) && - protocol == htons(ETH_P_IPV6))); -} - -static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) -{ - if (can_checksum_protocol(dev->features, skb->protocol)) - return true; - - if (skb->protocol == htons(ETH_P_8021Q)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - if (can_checksum_protocol(dev->features & dev->vlan_features, - veh->h_vlan_encapsulated_proto)) - return true; - } - - return false; -} - -/* - * Invalidate hardware checksum when packet is to be mangled, and - * complete checksum manually on outgoing path. - */ -int skb_checksum_help(struct sk_buff *skb) -{ - __wsum csum; - int ret = 0, offset; - - if (skb->ip_summed == CHECKSUM_COMPLETE) - goto out_set_summed; - - if (unlikely(skb_shinfo(skb)->gso_size)) { - /* Let GSO fix up the checksum. 
*/ - goto out_set_summed; - } - - offset = skb->csum_start - skb_headroom(skb); - BUG_ON(offset >= skb_headlen(skb)); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - - offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } - - *(__sum16 *)(skb->data + offset) = csum_fold(csum); -out_set_summed: - skb->ip_summed = CHECKSUM_NONE; -out: - return ret; -} - -/** - * skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_type *ptype; - __be16 type = skb->protocol; - int err; - - skb_reset_mac_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - __skb_pull(skb, skb->mac_len); - - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - struct net_device *dev = skb->dev; - struct ethtool_drvinfo info = {}; - - if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) - dev->ethtool_ops->get_drvinfo(dev, &info); - - WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " - "ip_summed=%d", - info.driver, dev ? dev->features : 0L, - skb->sk ? skb->sk->sk_route_caps : 0L, - skb->len, skb->data_len, skb->ip_summed); - - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) - return ERR_PTR(err); - } - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && !ptype->dev && ptype->gso_segment) { - if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - err = ptype->gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } - segs = ptype->gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; -} - -EXPORT_SYMBOL(skb_gso_segment); - -/* Take action when hardware reception checksum errors are detected. */ -#ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) -{ - if (net_ratelimit()) { - printk(KERN_ERR "%s: hw csum failure.\n", - dev ? dev->name : "<unknown>"); - dump_stack(); - } -} -EXPORT_SYMBOL(netdev_rx_csum_fault); -#endif - -/* Actually, we should eliminate this check as soon as we know, that: - * 1. IOMMU is present and allows to map all the memory. - * 2. No high memory really exists on this machine. 
- */ - -static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) -{ -#ifdef CONFIG_HIGHMEM - int i; - - if (dev->features & NETIF_F_HIGHDMA) - return 0; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (PageHighMem(skb_shinfo(skb)->frags[i].page)) - return 1; - -#endif - return 0; -} - -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - do { - struct sk_buff *nskb = skb->next; - - skb->next = nskb->next; - nskb->next = NULL; - kfree_skb(nskb); - } while (skb->next); - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct sk_buff *segs; - int features = dev->features & ~(illegal_highdma(dev, skb) ? - NETIF_F_SG : 0); - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) -{ - const struct net_device_ops *ops = dev->netdev_ops; - - prefetch(&dev->netdev_ops->ndo_start_xmit); - if (likely(!skb->next)) { - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); - - if (netif_needs_gso(dev, skb)) { - if (unlikely(dev_gso_segment(skb))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } - - return ops->ndo_start_xmit(skb, dev); - } - -gso: - do { - struct sk_buff *nskb = skb->next; - int rc; - - skb->next = nskb->next; - nskb->next = NULL; - rc = ops->ndo_start_xmit(nskb, dev); - if (unlikely(rc)) { - nskb->next = skb->next; - skb->next = nskb; - return rc; - } - if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); - - skb->destructor = DEV_GSO_CB(skb)->destructor; - -out_kfree_skb: - kfree_skb(skb); - return 0; -} - -static u32 simple_tx_hashrnd; -static int simple_tx_hashrnd_initialized = 0; - -static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) -{ - u32 addr1, addr2, ports; - u32 hash, ihl; - u8 ip_proto = 0; - - if (unlikely(!simple_tx_hashrnd_initialized)) { - get_random_bytes(&simple_tx_hashrnd, 4); - simple_tx_hashrnd_initialized = 1; - } - - switch (skb->protocol) { - case htons(ETH_P_IP): - if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))) - ip_proto = ip_hdr(skb)->protocol; - addr1 = ip_hdr(skb)->saddr; - addr2 = ip_hdr(skb)->daddr; - ihl = ip_hdr(skb)->ihl; - break; - case htons(ETH_P_IPV6): - ip_proto = ipv6_hdr(skb)->nexthdr; - addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3]; - addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3]; - ihl = (40 >> 2); - break; - default: - return 0; - } - - - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_AH: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - ports = *((u32 *) (skb_network_header(skb) + (ihl * 4))); - break; - - default: - ports = 0; - break; - } - - hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); - - return (u16) (((u64) hash * dev->real_num_tx_queues) >> 
32); -} - -static struct netdev_queue *dev_pick_tx(struct net_device *dev, - struct sk_buff *skb) -{ - const struct net_device_ops *ops = dev->netdev_ops; - u16 queue_index = 0; - - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb); - else if (dev->real_num_tx_queues > 1) - queue_index = simple_tx_hash(dev, skb); - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); -} - -/** - * dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit - * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. - * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. - * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. - * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) - * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG - */ -int dev_queue_xmit(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct netdev_queue *txq; - struct Qdisc *q; - int rc = -ENOMEM; - - /* GSO will handle the following emulations directly. */ - if (netif_needs_gso(dev, skb)) - goto gso; - - if (skb_shinfo(skb)->frag_list && - !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb)) - goto out_kfree_skb; - - /* Fragmented skb is linearized if device does not support SG, - * or if at least one of fragments is in highmem and device - * does not support DMA from it. - */ - if (skb_shinfo(skb)->nr_frags && - (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb)) - goto out_kfree_skb; - - /* If packet is not checksummed and device does not support - * checksumming for this protocol, complete checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, skb->csum_start - - skb_headroom(skb)); - if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb)) - goto out_kfree_skb; - } - -gso: - /* Disable soft irqs for various locks below. Also - * stops preemption for RCU. - */ - rcu_read_lock_bh(); - - txq = dev_pick_tx(dev, skb); - q = rcu_dereference(txq->qdisc); - -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); -#endif - if (q->enqueue) { - spinlock_t *root_lock = qdisc_lock(q); - - spin_lock(root_lock); - - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - kfree_skb(skb); - rc = NET_XMIT_DROP; - } else { - rc = qdisc_enqueue_root(skb, q); - qdisc_run(q); - } - spin_unlock(root_lock); - - goto out; - } - - /* The device has no queue. Common case for software devices: - loopback, all the sorts of tunnels... - - Really, it is unlikely that netif_tx_lock protection is necessary - here. (f.e. loopback and IP tunnels are clean ignoring statistics - counters.) - However, it is possible, that they rely on protection - made by us here. - - Check this and shot the lock. 
It is not prone from deadlocks. - Either shot noqueue qdisc, it is even simpler 8) - */ - if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ - - if (txq->xmit_lock_owner != cpu) { - - HARD_TX_LOCK(dev, txq, cpu); - - if (!netif_tx_queue_stopped(txq)) { - rc = 0; - if (!dev_hard_start_xmit(skb, dev, txq)) { - HARD_TX_UNLOCK(dev, txq); - goto out; - } - } - HARD_TX_UNLOCK(dev, txq); - if (net_ratelimit()) - printk(KERN_CRIT "Virtual device %s asks to " - "queue packet!\n", dev->name); - } else { - /* Recursion is detected! It is possible, - * unfortunately */ - if (net_ratelimit()) - printk(KERN_CRIT "Dead loop on virtual device " - "%s, fix it urgently!\n", dev->name); - } - } - - rc = -ENETDOWN; - rcu_read_unlock_bh(); - -out_kfree_skb: - kfree_skb(skb); - return rc; -out: - rcu_read_unlock_bh(); - return rc; -} - - -/*======================================================================= - Receiver routines - =======================================================================*/ - -int netdev_max_backlog __read_mostly = 1000; -int netdev_budget __read_mostly = 300; -int weight_p __read_mostly = 64; /* old backlog weight */ - -DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; - - -/** - * netif_rx - post buffer to the network code - * @skb: buffer to post - * - * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. - * - * return values: - * NET_RX_SUCCESS (no congestion) - * NET_RX_DROP (packet was dropped) - * - */ - -int netif_rx(struct sk_buff *skb) -{ -#ifndef DDE_LINUX - struct softnet_data *queue; - unsigned long flags; - - /* if netpoll wants it, pretend we never saw it */ - if (netpoll_rx(skb)) - return NET_RX_DROP; - - if (!skb->tstamp.tv64) - net_timestamp(skb); - - /* - * The code is rearranged so that the path is the most - * short when CPU is congested, but is still operating. 
- */ - local_irq_save(flags); - queue = &__get_cpu_var(softnet_data); - - __get_cpu_var(netdev_rx_stat).total++; - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { - if (queue->input_pkt_queue.qlen) { -enqueue: - dev_hold(skb->dev); - __skb_queue_tail(&queue->input_pkt_queue, skb); - local_irq_restore(flags); - return NET_RX_SUCCESS; - } - - napi_schedule(&queue->backlog); - goto enqueue; - } - - __get_cpu_var(netdev_rx_stat).dropped++; - local_irq_restore(flags); - - kfree_skb(skb); - return NET_RX_DROP; -#else /* DDE_LINUX */ - /* call our callback fn */ - return l4dde26_do_rx_callback(skb); -#endif -} - -int netif_rx_ni(struct sk_buff *skb) -{ - int err; - - preempt_disable(); - err = netif_rx(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); - - return err; -} - -EXPORT_SYMBOL(netif_rx_ni); - -static void net_tx_action(struct softirq_action *h) -{ - struct softnet_data *sd = &__get_cpu_var(softnet_data); - - if (sd->completion_queue) { - struct sk_buff *clist; - - local_irq_disable(); - clist = sd->completion_queue; - sd->completion_queue = NULL; - local_irq_enable(); - - while (clist) { - struct sk_buff *skb = clist; - clist = clist->next; - - WARN_ON(atomic_read(&skb->users)); - __kfree_skb(skb); - } - } - - if (sd->output_queue) { - struct Qdisc *head; - - local_irq_disable(); - head = sd->output_queue; - sd->output_queue = NULL; - local_irq_enable(); - - while (head) { - struct Qdisc *q = head; - spinlock_t *root_lock; - - head = head->next_sched; - - root_lock = qdisc_lock(q); - if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - qdisc_run(q); - spin_unlock(root_lock); - } else { - if (!test_bit(__QDISC_STATE_DEACTIVATED, - &q->state)) { - __netif_reschedule(q); - } else { - smp_mb__before_clear_bit(); - clear_bit(__QDISC_STATE_SCHED, - &q->state); - } - } - } - } -} - -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) -{ - atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); -} - -#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) -/* These hooks defined here for ATM */ -struct net_bridge; -struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, - unsigned char *addr); -void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; - -/* - * If bridge module is loaded call bridging hook. - * returns NULL if packet was consumed. 
- */ -struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, - struct sk_buff *skb) __read_mostly; -static inline struct sk_buff *handle_bridge(struct sk_buff *skb, - struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) -{ - struct net_bridge_port *port; - - if (skb->pkt_type == PACKET_LOOPBACK || - (port = rcu_dereference(skb->dev->br_port)) == NULL) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - return br_handle_frame_hook(port, skb); -} -#else -#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb) -#endif - -#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) -struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; -EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); - -static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, - struct net_device *orig_dev) -{ - if (skb->dev->macvlan_port == NULL) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - return macvlan_handle_frame_hook(skb); -} -#else -#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) -#endif - -#ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesnt stop any functionality; if you dont have - * the ingress scheduler, you just cant add policies on ingress. - * - */ -static int ing_filter(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); - struct netdev_queue *rxq; - int result = TC_ACT_OK; - struct Qdisc *q; - - if (MAX_RED_LOOP < ttl++) { - printk(KERN_WARNING - "Redir loop detected Dropping packet (%d->%d)\n", - skb->iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - rxq = &dev->rx_queue; - - q = rxq->qdisc; - if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); - } - - return result; -} - -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) -{ - if (skb->dev->rx_queue.qdisc == &noop_qdisc) - goto out; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } else { - /* Huh? Why does turning on AF_PACKET affect this? */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - } - - switch (ing_filter(skb)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; - } - -out: - skb->tc_verd = 0; - return skb; -} -#endif - -/* - * netif_nit_deliver - deliver received packets to network taps - * @skb: buffer - * - * This function is used to deliver incoming packets to network - * taps. It should be used when the normal netif_receive_skb path - * is bypassed, for example because of VLAN acceleration. 
- */ -void netif_nit_deliver(struct sk_buff *skb) -{ - struct packet_type *ptype; - - if (list_empty(&ptype_all)) - return; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) - deliver_skb(skb, ptype, skb->dev); - } - rcu_read_unlock(); -} - -/** - * netif_receive_skb - process receive buffer from network - * @skb: buffer to process - * - * netif_receive_skb() is the main receive data processing function. - * It always succeeds. The buffer may be dropped during processing - * for congestion control or by the protocol layers. - * - * This function may only be called from softirq context and interrupts - * should be enabled. - * - * Return values (usually ignored): - * NET_RX_SUCCESS: no congestion - * NET_RX_DROP: packet was dropped - */ -int netif_receive_skb(struct sk_buff *skb) -{ -#ifndef DDE_LINUX - struct packet_type *ptype, *pt_prev; - struct net_device *orig_dev; - struct net_device *null_or_orig; - int ret = NET_RX_DROP; - __be16 type; - - if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) - return NET_RX_SUCCESS; - - /* if we've gotten here through NAPI, check netpoll */ - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - - if (!skb->tstamp.tv64) - net_timestamp(skb); - - if (!skb->iif) - skb->iif = skb->dev->ifindex; - - null_or_orig = NULL; - orig_dev = skb->dev; - if (orig_dev->master) { - if (skb_bond_should_drop(skb)) - null_or_orig = orig_dev; /* deliver only exact match */ - else - skb->dev = orig_dev->master; - } - - __get_cpu_var(netdev_rx_stat).total++; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - - pt_prev = NULL; - - rcu_read_lock(); - -#ifdef CONFIG_NET_CLS_ACT - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - goto ncls; - } -#endif - - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (ptype->dev == null_or_orig || ptype->dev == skb->dev || - ptype->dev == orig_dev) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } - } - -#ifdef CONFIG_NET_CLS_ACT - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; -ncls: -#endif - - skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; - skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); - if (!skb) - goto out; - - type = skb->protocol; - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_orig || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } - } - - if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); - } else { - kfree_skb(skb); - /* Jamal, now you will not able to escape explaining - * me how you were going to use this. 
:-) - */ - ret = NET_RX_DROP; - } - -out: - rcu_read_unlock(); - return ret; -#else /* DDE_LINUX */ - /* call our callback fn */ - return l4dde26_do_rx_callback(skb); -#endif -} - - -/* Network device is going away, flush any packets still pending */ -static void flush_backlog(void *arg) -{ - struct net_device *dev = arg; - struct softnet_data *queue = &__get_cpu_var(softnet_data); - struct sk_buff *skb, *tmp; - - skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) - if (skb->dev == dev) { - __skb_unlink(skb, &queue->input_pkt_queue); - kfree_skb(skb); - } -} - -static int napi_gro_complete(struct sk_buff *skb) -{ - struct packet_type *ptype; - __be16 type = skb->protocol; - struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; - int err = -ENOENT; - - if (NAPI_GRO_CB(skb)->count == 1) - goto out; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || ptype->dev || !ptype->gro_complete) - continue; - - err = ptype->gro_complete(skb); - break; - } - rcu_read_unlock(); - - if (err) { - WARN_ON(&ptype->list == head); - kfree_skb(skb); - return NET_RX_SUCCESS; - } - -out: - skb_shinfo(skb)->gso_size = 0; - __skb_push(skb, -skb_network_offset(skb)); - return netif_receive_skb(skb); -} - -void napi_gro_flush(struct napi_struct *napi) -{ - struct sk_buff *skb, *next; - - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; - skb->next = NULL; - napi_gro_complete(skb); - } - - napi->gro_list = NULL; -} -EXPORT_SYMBOL(napi_gro_flush); - -int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - struct sk_buff **pp = NULL; - struct packet_type *ptype; - __be16 type = skb->protocol; - struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; - int count = 0; - int same_flow; - int mac_len; - int free; - - if (!(skb->dev->features & NETIF_F_GRO)) - goto normal; - - if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list) - goto normal; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - struct sk_buff *p; - - if (ptype->type != type || ptype->dev || !ptype->gro_receive) - continue; - - skb_reset_network_header(skb); - mac_len = skb->network_header - skb->mac_header; - skb->mac_len = mac_len; - NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = 0; - NAPI_GRO_CB(skb)->free = 0; - - for (p = napi->gro_list; p; p = p->next) { - count++; - - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - if (p->mac_len != mac_len || - memcmp(skb_mac_header(p), skb_mac_header(skb), - mac_len)) - NAPI_GRO_CB(p)->same_flow = 0; - } - - pp = ptype->gro_receive(&napi->gro_list, skb); - break; - } - rcu_read_unlock(); - - if (&ptype->list == head) - goto normal; - - same_flow = NAPI_GRO_CB(skb)->same_flow; - free = NAPI_GRO_CB(skb)->free; - - if (pp) { - struct sk_buff *nskb = *pp; - - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); - count--; - } - - if (same_flow) - goto ok; - - if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { - __skb_push(skb, -skb_network_offset(skb)); - goto normal; - } - - NAPI_GRO_CB(skb)->count = 1; - skb_shinfo(skb)->gso_size = skb->len; - skb->next = napi->gro_list; - napi->gro_list = skb; - -ok: - return free; - -normal: - return -1; -} -EXPORT_SYMBOL(dev_gro_receive); - -static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - struct sk_buff *p; - - for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = 1; - NAPI_GRO_CB(p)->flush = 0; - } - - return dev_gro_receive(napi, skb); -} - -int napi_gro_receive(struct 
napi_struct *napi, struct sk_buff *skb) -{ - if (netpoll_receive_skb(skb)) - return NET_RX_DROP; - - switch (__napi_gro_receive(napi, skb)) { - case -1: - return netif_receive_skb(skb); - - case 1: - kfree_skb(skb); - break; - } - - return NET_RX_SUCCESS; -} -EXPORT_SYMBOL(napi_gro_receive); - -void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) -{ - __skb_pull(skb, skb_headlen(skb)); - skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); - - napi->skb = skb; -} -EXPORT_SYMBOL(napi_reuse_skb); - -struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, - struct napi_gro_fraginfo *info) -{ - struct net_device *dev = napi->dev; - struct sk_buff *skb = napi->skb; - - napi->skb = NULL; - - if (!skb) { - skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); - if (!skb) - goto out; - - skb_reserve(skb, NET_IP_ALIGN); - } - - BUG_ON(info->nr_frags > MAX_SKB_FRAGS); - skb_shinfo(skb)->nr_frags = info->nr_frags; - memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags)); - - skb->data_len = info->len; - skb->len += info->len; - skb->truesize += info->len; - - if (!pskb_may_pull(skb, ETH_HLEN)) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; - } - - skb->protocol = eth_type_trans(skb, dev); - - skb->ip_summed = info->ip_summed; - skb->csum = info->csum; - -out: - return skb; -} -EXPORT_SYMBOL(napi_fraginfo_skb); - -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) -{ - struct sk_buff *skb = napi_fraginfo_skb(napi, info); - int err = NET_RX_DROP; - - if (!skb) - goto out; - - if (netpoll_receive_skb(skb)) - goto out; - - err = NET_RX_SUCCESS; - - switch (__napi_gro_receive(napi, skb)) { - case -1: - return netif_receive_skb(skb); - - case 0: - goto out; - } - - napi_reuse_skb(napi, skb); - -out: - return err; -} -EXPORT_SYMBOL(napi_gro_frags); - -static int process_backlog(struct napi_struct *napi, int quota) -{ - int work = 0; - struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; - - napi->weight = weight_p; - do { - struct sk_buff *skb; - - local_irq_disable(); - skb = __skb_dequeue(&queue->input_pkt_queue); - if (!skb) { - local_irq_enable(); - napi_complete(napi); - goto out; - } - local_irq_enable(); - - napi_gro_receive(napi, skb); - } while (++work < quota && jiffies == start_time); - - napi_gro_flush(napi); - -out: - return work; -} - -/** - * __napi_schedule - schedule for receive - * @n: entry to schedule - * - * The entry's receive function will be scheduled to run - */ -void __napi_schedule(struct napi_struct *n) -{ - unsigned long flags; - - local_irq_save(flags); - list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - local_irq_restore(flags); -} -EXPORT_SYMBOL(__napi_schedule); - -void __napi_complete(struct napi_struct *n) -{ - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); - - list_del(&n->poll_list); - smp_mb__before_clear_bit(); - clear_bit(NAPI_STATE_SCHED, &n->state); -} -EXPORT_SYMBOL(__napi_complete); - -void napi_complete(struct napi_struct *n) -{ - unsigned long flags; - - /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu - */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) - return; - - napi_gro_flush(n); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); -} -EXPORT_SYMBOL(napi_complete); - -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), 
int weight) -{ - INIT_LIST_HEAD(&napi->poll_list); - napi->gro_list = NULL; - napi->skb = NULL; - napi->poll = poll; - napi->weight = weight; - list_add(&napi->dev_list, &dev->napi_list); - napi->dev = dev; -#ifdef CONFIG_NETPOLL - spin_lock_init(&napi->poll_lock); - napi->poll_owner = -1; -#endif - set_bit(NAPI_STATE_SCHED, &napi->state); -} -EXPORT_SYMBOL(netif_napi_add); - -void netif_napi_del(struct napi_struct *napi) -{ - struct sk_buff *skb, *next; - - list_del_init(&napi->dev_list); - kfree_skb(napi->skb); - - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; - skb->next = NULL; - kfree_skb(skb); - } - - napi->gro_list = NULL; -} -EXPORT_SYMBOL(netif_napi_del); - - -static void net_rx_action(struct softirq_action *h) -{ - struct list_head *list = &__get_cpu_var(softnet_data).poll_list; - unsigned long time_limit = jiffies + 2; - int budget = netdev_budget; - void *have; - - local_irq_disable(); - - while (!list_empty(list)) { - struct napi_struct *n; - int work, weight; - - /* If softirq window is exhuasted then punt. - * Allow this to run for 2 jiffies since which will allow - * an average latency of 1.5/HZ. - */ - if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) - goto softnet_break; - - local_irq_enable(); - - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_entry(list->next, struct napi_struct, poll_list); - - have = netpoll_poll_lock(n); - - weight = n->weight; - - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidently calling ->poll() when NAPI is not scheduled. - */ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) - work = n->poll(n, weight); - - WARN_ON_ONCE(work > weight); - - budget -= work; - - local_irq_disable(); - - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. - */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) - __napi_complete(n); - else - list_move_tail(&n->poll_list, list); - } - - netpoll_poll_unlock(have); - } -out: - local_irq_enable(); - -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif - - return; - -softnet_break: - __get_cpu_var(netdev_rx_stat).time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto out; -} - -static gifconf_func_t * gifconf_list [NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t * gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} - - -/* - * Map an interface index to its name (SIOCGIFNAME) - */ - -/* - * We need this ioctl for efficient implementation of the - * if_indextoname() function required by the IPv6 API. 
Without - * it, we would have to search all the interfaces to find a - * match. --pb - */ - -static int dev_ifname(struct net *net, struct ifreq __user *arg) -{ - struct net_device *dev; - struct ifreq ifr; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifr.ifr_ifindex); - if (!dev) { - read_unlock(&dev_base_lock); - return -ENODEV; - } - - strcpy(ifr.ifr_name, dev->name); - read_unlock(&dev_base_lock); - - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; -} - -/* - * Perform a SIOCGIFCONF call. This structure will change - * size eventually, and there is nothing I can do about it. - * Thus we will need a 'compatibility mode'. - */ - -static int dev_ifconf(struct net *net, char __user *arg) -{ - struct ifconf ifc; - struct net_device *dev; - char __user *pos; - int len; - int total; - int i; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; - - pos = ifc.ifc_buf; - len = ifc.ifc_len; - - /* - * Loop over the interfaces, and write an info block for each. - */ - - total = 0; - for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0); - else - done = gifconf_list[i](dev, pos + total, - len - total); - if (done < 0) - return -EFAULT; - total += done; - } - } - } - - /* - * All done. Write the updated control block back to the caller. - */ - ifc.ifc_len = total; - - /* - * Both BSD and Solaris return 0 here, so we do too. - */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; -} - -#ifdef CONFIG_PROC_FS -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. - */ -void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) -{ - struct net *net = seq_file_net(seq); - loff_t off; - struct net_device *dev; - - read_lock(&dev_base_lock); - if (!*pos) - return SEQ_START_TOKEN; - - off = 1; - for_each_netdev(net, dev) - if (off++ == *pos) - return dev; - - return NULL; -} - -void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct net *net = seq_file_net(seq); - ++*pos; - return v == SEQ_START_TOKEN ? - first_net_device(net) : next_net_device((struct net_device *)v); -} - -void dev_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) -{ - read_unlock(&dev_base_lock); -} - -static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) -{ - const struct net_device_stats *stats = dev_get_stats(dev); - - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); -} - -/* - * Called from the PROCfs module. 
This now uses the new arbitrary sized - * /proc/net interface to create /proc/net/dev - */ -static int dev_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); - else - dev_seq_printf_stats(seq, v); - return 0; -} - -static struct netif_rx_stats *softnet_get_online(loff_t *pos) -{ - struct netif_rx_stats *rc = NULL; - - while (*pos < nr_cpu_ids) - if (cpu_online(*pos)) { - rc = &per_cpu(netdev_rx_stat, *pos); - break; - } else - ++*pos; - return rc; -} - -static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) -{ - return softnet_get_online(pos); -} - -static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return softnet_get_online(pos); -} - -static void softnet_seq_stop(struct seq_file *seq, void *v) -{ -} - -static int softnet_seq_show(struct seq_file *seq, void *v) -{ - struct netif_rx_stats *s = v; - - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, 0, - 0, 0, 0, 0, /* was fastroute */ - s->cpu_collision ); - return 0; -} - -static const struct seq_operations dev_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_seq_show, -}; - -static int dev_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, - .open = dev_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static const struct seq_operations softnet_seq_ops = { - .start = softnet_seq_start, - .next = softnet_seq_next, - .stop = softnet_seq_stop, - .show = softnet_seq_show, -}; - -static int softnet_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &softnet_seq_ops); -} - -static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, - .open = softnet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *ptype_get_idx(loff_t pos) -{ - struct packet_type *pt = NULL; - loff_t i = 0; - int t; - - list_for_each_entry_rcu(pt, &ptype_all, list) { - if (i == pos) - return pt; - ++i; - } - - for (t = 0; t < PTYPE_HASH_SIZE; t++) { - list_for_each_entry_rcu(pt, &ptype_base[t], list) { - if (i == pos) - return pt; - ++i; - } - } - return NULL; -} - -static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - rcu_read_lock(); - return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; -} - -static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct packet_type *pt; - struct list_head *nxt; - int hash; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); - - pt = v; - nxt = pt->list.next; - if (pt->type == htons(ETH_P_ALL)) { - if (nxt != &ptype_all) - goto found; - hash = 0; - nxt = ptype_base[0].next; - } else - hash = ntohs(pt->type) & PTYPE_HASH_MASK; - - while (nxt == &ptype_base[hash]) { - if (++hash >= PTYPE_HASH_SIZE) - return NULL; - nxt = ptype_base[hash].next; - } -found: - return list_entry(nxt, struct packet_type, list); -} - -static void ptype_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static int ptype_seq_show(struct seq_file *seq, void *v) -{ - struct packet_type *pt = v; - - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { - if (pt->type == htons(ETH_P_ALL)) - seq_puts(seq, "ALL "); - else - seq_printf(seq, "%04x", ntohs(pt->type)); - - seq_printf(seq, " %-8s %pF\n", - pt->dev ? pt->dev->name : "", pt->func); - } - - return 0; -} - -static const struct seq_operations ptype_seq_ops = { - .start = ptype_seq_start, - .next = ptype_seq_next, - .stop = ptype_seq_stop, - .show = ptype_seq_show, -}; - -static int ptype_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ptype_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, - .open = ptype_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - - -static int __net_init dev_proc_net_init(struct net *net) -{ - int rc = -ENOMEM; - - if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) - goto out; - if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) - goto out_dev; - if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) - goto out_softnet; - - if (wext_proc_init(net)) - goto out_ptype; - rc = 0; -out: - return rc; -out_ptype: - proc_net_remove(net, "ptype"); -out_softnet: - proc_net_remove(net, "softnet_stat"); -out_dev: - proc_net_remove(net, "dev"); - goto out; -} - -static void __net_exit dev_proc_net_exit(struct net *net) -{ - wext_proc_exit(net); - - proc_net_remove(net, "ptype"); - proc_net_remove(net, "softnet_stat"); - proc_net_remove(net, "dev"); -} - -static struct pernet_operations __net_initdata dev_proc_ops = { - .init = dev_proc_net_init, - .exit = dev_proc_net_exit, -}; - -static int __init dev_proc_init(void) -{ - return register_pernet_subsys(&dev_proc_ops); -} -#else -#define dev_proc_init() 0 -#endif /* CONFIG_PROC_FS */ - - -/** - * netdev_set_master - set up master/slave pair - * @slave: slave device - * @master: new master device - * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success the reference counts - * are adjusted, %RTM_NEWLINK is sent to the routing socket and the - * function returns zero. 
- */ -int netdev_set_master(struct net_device *slave, struct net_device *master) -{ - struct net_device *old = slave->master; - - ASSERT_RTNL(); - - if (master) { - if (old) - return -EBUSY; - dev_hold(master); - } - - slave->master = master; - - synchronize_net(); - - if (old) - dev_put(old); - - if (master) - slave->flags |= IFF_SLAVE; - else - slave->flags &= ~IFF_SLAVE; - - rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); - return 0; -} - -static void dev_change_rx_flags(struct net_device *dev, int flags) -{ - const struct net_device_ops *ops = dev->netdev_ops; - - if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) - ops->ndo_change_rx_flags(dev, flags); -} - -static int __dev_set_promiscuity(struct net_device *dev, int inc) -{ - unsigned short old_flags = dev->flags; - uid_t uid; - gid_t gid; - - ASSERT_RTNL(); - - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { - /* - * Avoid overflow. - * If inc causes overflow, untouch promisc and return error. - */ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; - printk(KERN_WARNING "%s: promiscuity touches roof, " - "set promiscuity failed, promiscuity feature " - "of device might be broken.\n", dev->name); - return -EOVERFLOW; - } - } - if (dev->flags != old_flags) { - printk(KERN_INFO "device %s %s promiscuous mode\n", - dev->name, (dev->flags & IFF_PROMISC) ? "entered" : - "left"); - if (audit_enabled) { - current_uid_gid(&uid, &gid); - audit_log(current->audit_context, GFP_ATOMIC, - AUDIT_ANOM_PROMISCUOUS, - "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", - dev->name, (dev->flags & IFF_PROMISC), - (old_flags & IFF_PROMISC), - audit_get_loginuid(current), - uid, gid, - audit_get_sessionid(current)); - } - - dev_change_rx_flags(dev, IFF_PROMISC); - } - return 0; -} - -/** - * dev_set_promiscuity - update promiscuity count on a device - * @dev: device - * @inc: modifier - * - * Add or remove promiscuity from a device. While the count in the device - * remains above zero the interface remains promiscuous. Once it hits zero - * the device reverts back to normal filtering operation. A negative inc - * value is used to drop promiscuity on the device. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_promiscuity(struct net_device *dev, int inc) -{ - unsigned short old_flags = dev->flags; - int err; - - err = __dev_set_promiscuity(dev, inc); - if (err < 0) - return err; - if (dev->flags != old_flags) - dev_set_rx_mode(dev); - return err; -} - -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) -{ - unsigned short old_flags = dev->flags; - - ASSERT_RTNL(); - - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { - /* - * Avoid overflow. - * If inc causes overflow, untouch allmulti and return error. 
- */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; - printk(KERN_WARNING "%s: allmulti touches roof, " - "set allmulti failed, allmulti feature of " - "device might be broken.\n", dev->name); - return -EOVERFLOW; - } - } - if (dev->flags ^ old_flags) { - dev_change_rx_flags(dev, IFF_ALLMULTI); - dev_set_rx_mode(dev); - } - return 0; -} - -/* - * Upload unicast and multicast address lists to device and - * configure RX filtering. When the device doesn't support unicast - * filtering it is put in promiscuous mode while unicast addresses - * are present. - */ -void __dev_set_rx_mode(struct net_device *dev) -{ - const struct net_device_ops *ops = dev->netdev_ops; - - /* dev_open will call this function so the list will stay sane. */ - if (!(dev->flags&IFF_UP)) - return; - - if (!netif_device_present(dev)) - return; - - if (ops->ndo_set_rx_mode) - ops->ndo_set_rx_mode(dev); - else { - /* Unicast addresses changes may only happen under the rtnl, - * therefore calling __dev_set_promiscuity here is safe. - */ - if (dev->uc_count > 0 && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); - dev->uc_promisc = 1; - } else if (dev->uc_count == 0 && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); - dev->uc_promisc = 0; - } - - if (ops->ndo_set_multicast_list) - ops->ndo_set_multicast_list(dev); - } -} - -void dev_set_rx_mode(struct net_device *dev) -{ - netif_addr_lock_bh(dev); - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); -} - -int __dev_addr_delete(struct dev_addr_list **list, int *count, - void *addr, int alen, int glbl) -{ - struct dev_addr_list *da; - - for (; (da = *list) != NULL; list = &da->next) { - if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && - alen == da->da_addrlen) { - if (glbl) { - int old_glbl = da->da_gusers; - da->da_gusers = 0; - if (old_glbl == 0) - break; - } - if (--da->da_users) - return 0; - - *list = da->next; - kfree(da); - (*count)--; - return 0; - } - } - return -ENOENT; -} - -int __dev_addr_add(struct dev_addr_list **list, int *count, - void *addr, int alen, int glbl) -{ - struct dev_addr_list *da; - - for (da = *list; da != NULL; da = da->next) { - if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && - da->da_addrlen == alen) { - if (glbl) { - int old_glbl = da->da_gusers; - da->da_gusers = 1; - if (old_glbl) - return 0; - } - da->da_users++; - return 0; - } - } - - da = kzalloc(sizeof(*da), GFP_ATOMIC); - if (da == NULL) - return -ENOMEM; - memcpy(da->da_addr, addr, alen); - da->da_addrlen = alen; - da->da_users = 1; - da->da_gusers = glbl ? 1 : 0; - da->next = *list; - *list = da; - (*count)++; - return 0; -} - -/** - * dev_unicast_delete - Release secondary unicast address. - * @dev: device - * @addr: address to delete - * @alen: length of @addr - * - * Release reference to a secondary unicast address and remove it - * from the device if the reference count drops to zero. - * - * The caller must hold the rtnl_mutex. - */ -int dev_unicast_delete(struct net_device *dev, void *addr, int alen) -{ - int err; - - ASSERT_RTNL(); - - netif_addr_lock_bh(dev); - err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); - if (!err) - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); - return err; -} -EXPORT_SYMBOL(dev_unicast_delete); - -/** - * dev_unicast_add - add a secondary unicast address - * @dev: device - * @addr: address to add - * @alen: length of @addr - * - * Add a secondary unicast address to the device or increase - * the reference count if it already exists. 
- * - * The caller must hold the rtnl_mutex. - */ -int dev_unicast_add(struct net_device *dev, void *addr, int alen) -{ - int err; - - ASSERT_RTNL(); - - netif_addr_lock_bh(dev); - err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); - if (!err) - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); - return err; -} -EXPORT_SYMBOL(dev_unicast_add); - -int __dev_addr_sync(struct dev_addr_list **to, int *to_count, - struct dev_addr_list **from, int *from_count) -{ - struct dev_addr_list *da, *next; - int err = 0; - - da = *from; - while (da != NULL) { - next = da->next; - if (!da->da_synced) { - err = __dev_addr_add(to, to_count, - da->da_addr, da->da_addrlen, 0); - if (err < 0) - break; - da->da_synced = 1; - da->da_users++; - } else if (da->da_users == 1) { - __dev_addr_delete(to, to_count, - da->da_addr, da->da_addrlen, 0); - __dev_addr_delete(from, from_count, - da->da_addr, da->da_addrlen, 0); - } - da = next; - } - return err; -} - -void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, - struct dev_addr_list **from, int *from_count) -{ - struct dev_addr_list *da, *next; - - da = *from; - while (da != NULL) { - next = da->next; - if (da->da_synced) { - __dev_addr_delete(to, to_count, - da->da_addr, da->da_addrlen, 0); - da->da_synced = 0; - __dev_addr_delete(from, from_count, - da->da_addr, da->da_addrlen, 0); - } - da = next; - } -} - -/** - * dev_unicast_sync - Synchronize device's unicast list to another device - * @to: destination device - * @from: source device - * - * Add newly added addresses to the destination device and release - * addresses that have no users left. The source device must be - * locked by netif_addr_lock_bh. - * - * This function is intended to be called from the dev->set_rx_mode - * function of layered software devices. - */ -int dev_unicast_sync(struct net_device *to, struct net_device *from) -{ - int err = 0; - - netif_addr_lock_bh(to); - err = __dev_addr_sync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); - if (!err) - __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); - return err; -} -EXPORT_SYMBOL(dev_unicast_sync); - -/** - * dev_unicast_unsync - Remove synchronized addresses from the destination device - * @to: destination device - * @from: source device - * - * Remove all addresses that were added to the destination device by - * dev_unicast_sync(). This function is intended to be called from the - * dev->stop function of layered software devices. - */ -void dev_unicast_unsync(struct net_device *to, struct net_device *from) -{ - netif_addr_lock_bh(from); - netif_addr_lock(to); - - __dev_addr_unsync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); - __dev_set_rx_mode(to); - - netif_addr_unlock(to); - netif_addr_unlock_bh(from); -} -EXPORT_SYMBOL(dev_unicast_unsync); - -static void __dev_addr_discard(struct dev_addr_list **list) -{ - struct dev_addr_list *tmp; - - while (*list != NULL) { - tmp = *list; - *list = tmp->next; - if (tmp->da_users > tmp->da_gusers) - printk("__dev_addr_discard: address leakage! " - "da_users=%d\n", tmp->da_users); - kfree(tmp); - } -} - -static void dev_addr_discard(struct net_device *dev) -{ - netif_addr_lock_bh(dev); - - __dev_addr_discard(&dev->uc_list); - dev->uc_count = 0; - - __dev_addr_discard(&dev->mc_list); - dev->mc_count = 0; - - netif_addr_unlock_bh(dev); -} - -/** - * dev_get_flags - get flags reported to userspace - * @dev: device - * - * Get the combination of flag bits exported through APIs to userspace. 
- */ -unsigned dev_get_flags(const struct net_device *dev) -{ - unsigned flags; - - flags = (dev->flags & ~(IFF_PROMISC | - IFF_ALLMULTI | - IFF_RUNNING | - IFF_LOWER_UP | - IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | - IFF_ALLMULTI)); - - if (netif_running(dev)) { - if (netif_oper_up(dev)) - flags |= IFF_RUNNING; - if (netif_carrier_ok(dev)) - flags |= IFF_LOWER_UP; - if (netif_dormant(dev)) - flags |= IFF_DORMANT; - } - - return flags; -} - -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned flags) -{ - int ret, changes; - int old_flags = dev->flags; - - ASSERT_RTNL(); - - /* - * Set the flags on our device. - */ - - dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | - IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | - IFF_AUTOMEDIA)) | - (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | - IFF_ALLMULTI)); - - /* - * Load in the correct multicast list now the flags have changed. - */ - - if ((old_flags ^ flags) & IFF_MULTICAST) - dev_change_rx_flags(dev, IFF_MULTICAST); - - dev_set_rx_mode(dev); - - /* - * Have we downed the interface. We handle IFF_UP ourselves - * according to user attempts to set it, rather than blindly - * setting it. - */ - - ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ - ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); - - if (!ret) - dev_set_rx_mode(dev); - } - - if (dev->flags & IFF_UP && - ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | - IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); - - if ((flags ^ dev->gflags) & IFF_PROMISC) { - int inc = (flags & IFF_PROMISC) ? +1 : -1; - dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); - } - - /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI - is important. Some (broken) drivers set IFF_PROMISC, when - IFF_ALLMULTI is requested not asking us and not reporting. - */ - if ((flags ^ dev->gflags) & IFF_ALLMULTI) { - int inc = (flags & IFF_ALLMULTI) ? +1 : -1; - dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); - } - - /* Exclude state transition flags, already notified */ - changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - return ret; -} - -/** - * dev_set_mtu - Change maximum transfer unit - * @dev: device - * @new_mtu: new transfer unit - * - * Change the maximum transfer size of the network device. - */ -int dev_set_mtu(struct net_device *dev, int new_mtu) -{ - const struct net_device_ops *ops = dev->netdev_ops; - int err; - - if (new_mtu == dev->mtu) - return 0; - - /* MTU must be positive. 
*/ - if (new_mtu < 0) - return -EINVAL; - - if (!netif_device_present(dev)) - return -ENODEV; - - err = 0; - if (ops->ndo_change_mtu) - err = ops->ndo_change_mtu(dev, new_mtu); - else - dev->mtu = new_mtu; - - if (!err && dev->flags & IFF_UP) - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); - return err; -} - -/** - * dev_set_mac_address - Change Media Access Control Address - * @dev: device - * @sa: new address - * - * Change the hardware (MAC) address of the device - */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) -{ - const struct net_device_ops *ops = dev->netdev_ops; - int err; - - if (!ops->ndo_set_mac_address) - return -EOPNOTSUPP; - if (sa->sa_family != dev->type) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - err = ops->ndo_set_mac_address(dev, sa); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; -} - -/* - * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) - */ -static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ - int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - - if (!dev) - return -ENODEV; - - switch (cmd) { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = dev_get_flags(dev); - return 0; - - case SIOCGIFMETRIC: /* Get the metric on the interface - (currently unused) */ - ifr->ifr_metric = 0; - return 0; - - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; - - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - - case SIOCGIFSLAVE: - err = -EINVAL; - break; - - case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; - - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; - - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; - - default: - /* dev_ioctl() should ensure this case - * is never reached - */ - WARN_ON(1); - err = -EINVAL; - break; - - } - return err; -} - -/* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() - */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) -{ - int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - const struct net_device_ops *ops; - - if (!dev) - return -ENODEV; - - ops = dev->netdev_ops; - - switch (cmd) { - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); - - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ - return -EOPNOTSUPP; - - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); - - case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return 0; - - case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; - - case 
SIOCADDMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); - - case SIOCDELMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); - - case SIOCSIFTXQLEN: - if (ifr->ifr_qlen < 0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; - - case SIOCSIFNAME: - ifr->ifr_newname[IFNAMSIZ-1] = '\0'; - return dev_change_name(dev, ifr->ifr_newname); - - /* - * Unknown or private ioctl - */ - - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (ops->ndo_do_ioctl) { - if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; - - } - return err; -} - -/* - * This function handles all "interface"-type I/O control requests. The actual - * 'doing' part of this is dev_ifsioc above. - */ - -/** - * dev_ioctl - network device ioctl - * @net: the applicable net namespace - * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space - * - * Issue ioctl functions to devices. This is normally called by the - * user space syscall interfaces but can sometimes be useful for - * other purposes. The return value is the return from the syscall if - * positive or a negative errno code on error. - */ - -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) -{ - struct ifreq ifr; - int ret; - char *colon; - - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } - if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; - - colon = strchr(ifr.ifr_name, ':'); - if (colon) - *colon = 0; - - /* - * See which interface the caller is talking about. - */ - - switch (cmd) { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. - * - return a value - */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); - read_lock(&dev_base_lock); - ret = dev_ifsioc_locked(net, &ifr, cmd); - read_unlock(&dev_base_lock); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(net, &ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. 
- * - require strict serialization. - * - return a value - */ - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSIFNAME: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - do not return a value - */ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFMAP: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: - case SIOCSMIIREG: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* fall through */ - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - return ret; - - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but - * currently do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. - * Not applicable in our case */ - case SIOCSIFLINK: - return -EINVAL; - - /* - * Unknown or private ioctl. - */ - default: - if (cmd == SIOCWANDEV || - (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(net, &ifr, cmd, arg); - return -EINVAL; - } -} - - -/** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace - * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. - */ -static int dev_new_index(struct net *net) -{ - static int ifindex; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return ifindex; - } -} - -/* Delayed registration/unregisteration */ -static LIST_HEAD(net_todo_list); - -static void net_set_todo(struct net_device *dev) -{ - list_add_tail(&dev->todo_list, &net_todo_list); -} - -static void rollback_registered(struct net_device *dev) -{ - BUG_ON(dev_boot_phase); - ASSERT_RTNL(); - - /* Some devices call without registering for initialization unwind. */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " - "was registered\n", dev->name, dev); - - WARN_ON(1); - return; - } - - BUG_ON(dev->reg_state != NETREG_REGISTERED); - - /* If device is running, close it first. */ - dev_close(dev); - - /* And unlink it from device chain. */ - unlist_netdevice(dev); - - dev->reg_state = NETREG_UNREGISTERING; - - synchronize_net(); - - /* Shutdown queueing discipline. */ - dev_shutdown(dev); - - - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. 
- */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - - /* - * Flush the unicast and multicast chains - */ - dev_addr_discard(dev); - - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); - - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); - - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); - - synchronize_net(); - - dev_put(dev); -} - -static void __netdev_init_queue_locks_one(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_unused) -{ - spin_lock_init(&dev_queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; -} - -static void netdev_init_queue_locks(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); -} - -unsigned long netdev_fix_features(unsigned long features, const char *name) -{ - /* Fix illegal SG+CSUM combinations. */ - if ((features & NETIF_F_SG) && - !(features & NETIF_F_ALL_CSUM)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " - "checksum feature.\n", name); - features &= ~NETIF_F_SG; - } - - /* TSO requires that SG is present as well. */ - if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " - "SG feature.\n", name); - features &= ~NETIF_F_TSO; - } - - if (features & NETIF_F_UFO) { - if (!(features & NETIF_F_GEN_CSUM)) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_HW_CSUM feature.\n", - name); - features &= ~NETIF_F_UFO; - } - - if (!(features & NETIF_F_SG)) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_SG feature.\n", name); - features &= ~NETIF_F_UFO; - } - } - - return features; -} -EXPORT_SYMBOL(netdev_fix_features); - -/* Some devices need to (re-)set their netdev_ops inside - * ->init() or similar. If that happens, we have to setup - * the compat pointers again. - */ -void netdev_resync_ops(struct net_device *dev) -{ -#ifdef CONFIG_COMPAT_NET_DEV_OPS - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif -#endif -} -EXPORT_SYMBOL(netdev_resync_ops); - -/** - * register_netdevice - register a network device - * @dev: device to register - * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. 
- * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. - */ - -int register_netdevice(struct net_device *dev) -{ - struct hlist_head *head; - struct hlist_node *p; - int ret; - struct net *net = dev_net(dev); - - BUG_ON(dev_boot_phase); - ASSERT_RTNL(); - - might_sleep(); - - /* When net_device's are persistent, this will be fatal. */ - BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); - BUG_ON(!net); - - spin_lock_init(&dev->addr_list_lock); - netdev_set_addr_lockdep_class(dev); - netdev_init_queue_locks(dev); - - dev->iflink = -1; - -#ifdef CONFIG_COMPAT_NET_DEV_OPS - /* Netdevice_ops API compatiability support. - * This is temporary until all network devices are converted. - */ - if (dev->netdev_ops) { - netdev_resync_ops(dev); - } else { - char drivername[64]; - pr_info("%s (%s): not using net_device_ops yet\n", - dev->name, netdev_drivername(dev, drivername, 64)); - - /* This works only because net_device_ops and the - compatiablity structure are the same. */ - dev->netdev_ops = (void *) &(dev->init); - } -#endif - - /* Init, if this function is available */ - if (dev->netdev_ops->ndo_init) { - ret = dev->netdev_ops->ndo_init(dev); - if (ret) { - if (ret > 0) - ret = -EIO; - goto out; - } - } - - if (!dev_valid_name(dev->name)) { - ret = -EINVAL; - goto err_uninit; - } - - dev->ifindex = dev_new_index(net); - if (dev->iflink == -1) - dev->iflink = dev->ifindex; - - /* Check for existence of name */ - head = dev_name_hash(net, dev->name); - hlist_for_each(p, head) { - struct net_device *d - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(d->name, dev->name, IFNAMSIZ)) { - ret = -EEXIST; - goto err_uninit; - } - } - - /* Fix illegal checksum combinations */ - if ((dev->features & NETIF_F_HW_CSUM) && - (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", - dev->name); - dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); - } - - if ((dev->features & NETIF_F_NO_CSUM) && - (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", - dev->name); - dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); - } - - dev->features = netdev_fix_features(dev->features, dev->name); - - /* Enable software GSO if SG is supported. */ - if (dev->features & NETIF_F_SG) - dev->features |= NETIF_F_GSO; - - netdev_initialize_kobject(dev); - ret = netdev_register_kobject(dev); - if (ret) - goto err_uninit; - dev->reg_state = NETREG_REGISTERED; - - /* - * Default initial state at registry is that the - * device is present. - */ - - set_bit(__LINK_STATE_PRESENT, &dev->state); - - dev_init_scheduler(dev); - dev_hold(dev); - list_netdevice(dev); - - /* Notify protocols, that a new device appeared. */ - ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); - ret = notifier_to_errno(ret); - if (ret) { - rollback_registered(dev); - dev->reg_state = NETREG_UNREGISTERED; - } - -out: - return ret; - -err_uninit: - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); - goto out; -} - -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. 
This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. - */ -int init_dummy_netdev(struct net_device *dev) -{ - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - - /* make sure we BUG if trying to hit standard - * register/unregister code path - */ - dev->reg_state = NETREG_DUMMY; - - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - - /* NAPI wants this */ - INIT_LIST_HEAD(&dev->napi_list); - - /* a dummy interface is started by default */ - set_bit(__LINK_STATE_PRESENT, &dev->state); - set_bit(__LINK_STATE_START, &dev->state); - - return 0; -} -EXPORT_SYMBOL_GPL(init_dummy_netdev); - - -/** - * register_netdev - register a network device - * @dev: device to register - * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * This is a wrapper around register_netdevice that takes the rtnl semaphore - * and expands the device name if you passed a format string to - * alloc_netdev. - */ -int register_netdev(struct net_device *dev) -{ - int err; - - rtnl_lock(); - - /* - * If the name is a format string the caller wants us to do a - * name allocation. - */ - if (strchr(dev->name, '%')) { - err = dev_alloc_name(dev, dev->name); - if (err < 0) - goto out; - } - - err = register_netdevice(dev); -out: - rtnl_unlock(); - return err; -} -EXPORT_SYMBOL(register_netdev); - -/* - * netdev_wait_allrefs - wait until all references are gone. - * - * This is called when unregistering network devices. - * - * Any protocol or device that holds a reference should register - * for netdevice notification, and cleanup and put back the - * reference if they receive an UNREGISTER event. - * We can get stuck here if buggy protocols don't correctly - * call dev_put. - */ -static void netdev_wait_allrefs(struct net_device *dev) -{ - unsigned long rebroadcast_time, warning_time; - - rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { - if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { - rtnl_lock(); - - /* Rebroadcast unregister notification */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } - - __rtnl_unlock(); - - rebroadcast_time = jiffies; - } - - msleep(250); - - if (time_after(jiffies, warning_time + 10 * HZ)) { - printk(KERN_EMERG "unregister_netdevice: " - "waiting for %s to become free. Usage " - "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); - warning_time = jiffies; - } - } -} - -/* The sequence is: - * - * rtnl_lock(); - * ... - * register_netdevice(x1); - * register_netdevice(x2); - * ... - * unregister_netdevice(y1); - * unregister_netdevice(y2); - * ... - * rtnl_unlock(); - * free_netdev(y1); - * free_netdev(y2); - * - * We are invoked by rtnl_unlock(). 
- * This allows us to deal with problems: - * 1) We can delete sysfs objects which invoke hotplug - * without deadlocking with linkwatch via keventd. - * 2) Since we run with the RTNL semaphore not held, we can sleep - * safely in order to wait for the netdev refcnt to drop to zero. - * - * We must not return until all unregister events added during - * the interval the lock was held have been completed. - */ -void netdev_run_todo(void) -{ - struct list_head list; - - /* Snapshot list, allow later requests */ - list_replace_init(&net_todo_list, &list); - - __rtnl_unlock(); - - while (!list_empty(&list)) { - struct net_device *dev - = list_entry(list.next, struct net_device, todo_list); - list_del(&dev->todo_list); - - if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { - printk(KERN_ERR "network todo '%s' but state %d\n", - dev->name, dev->reg_state); - dump_stack(); - continue; - } - - dev->reg_state = NETREG_UNREGISTERED; - - on_each_cpu(flush_backlog, dev, 1); - - netdev_wait_allrefs(dev); - - /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); - WARN_ON(dev->ip_ptr); - WARN_ON(dev->ip6_ptr); - WARN_ON(dev->dn_ptr); - - if (dev->destructor) - dev->destructor(dev); - - /* Free network device */ - kobject_put(&dev->dev.kobj); - } -} - -/** - * dev_get_stats - get network device statistics - * @dev: device to get statistics from - * - * Get network statistics from device. The device driver may provide - * its own method by setting dev->netdev_ops->get_stats; otherwise - * the internal statistics structure is used. - */ -const struct net_device_stats *dev_get_stats(struct net_device *dev) - { - const struct net_device_ops *ops = dev->netdev_ops; - - if (ops->ndo_get_stats) - return ops->ndo_get_stats(dev); - else - return &dev->stats; -} -EXPORT_SYMBOL(dev_get_stats); - -static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue, - void *_unused) -{ - queue->dev = dev; -} - -static void netdev_init_queues(struct net_device *dev) -{ - netdev_init_one_queue(dev, &dev->rx_queue, NULL); - netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); - spin_lock_init(&dev->tx_global_lock); -} - -/** - * alloc_netdev_mq - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @queue_count: the number of subqueues to allocate - * - * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subquue structs - * for each queue on the device at the end of the netdevice. 
- */ -struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, - void (*setup)(struct net_device *), unsigned int queue_count) -{ - struct netdev_queue *tx; - struct net_device *dev; - size_t alloc_size; - void *p; - - BUG_ON(strlen(name) >= sizeof(dev->name)); - - alloc_size = sizeof(struct net_device); - if (sizeof_priv) { - /* ensure 32-byte alignment of private area */ - alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; - alloc_size += sizeof_priv; - } - /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN_CONST; - - p = kzalloc(alloc_size, GFP_KERNEL); - if (!p) { - printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); - return NULL; - } - - tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "tx qdiscs.\n"); - kfree(p); - return NULL; - } - - dev = (struct net_device *) - (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); - dev->padded = (char *)dev - (char *)p; - dev_net_set(dev, &init_net); - - dev->_tx = tx; - dev->num_tx_queues = queue_count; - dev->real_num_tx_queues = queue_count; - - dev->gso_max_size = GSO_MAX_SIZE; - - netdev_init_queues(dev); - - INIT_LIST_HEAD(&dev->napi_list); - setup(dev); - strcpy(dev->name, name); - return dev; -} -EXPORT_SYMBOL(alloc_netdev_mq); - -/** - * free_netdev - free network device - * @dev: device - * - * This function does the last stage of destroying an allocated device - * interface. The reference to the device object is released. - * If this is the last reference then it will be freed. - */ -void free_netdev(struct net_device *dev) -{ - struct napi_struct *p, *n; - - release_net(dev_net(dev)); - - kfree(dev->_tx); - - list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) - netif_napi_del(p); - - /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); - return; - } - - BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; - - /* will free via device release */ - put_device(&dev->dev); -} - -/** - * synchronize_net - Synchronize with packet receive processing - * - * Wait for packets currently being received to be done. - * Does not block later packets from starting. - */ -void synchronize_net(void) -{ - might_sleep(); -#ifndef DDE_LINUX - synchronize_rcu(); -#endif -} - -/** - * unregister_netdevice - remove device from the kernel - * @dev: device - * - * This function shuts down a device interface and removes it - * from the kernel tables. - * - * Callers must hold the rtnl semaphore. You may want - * unregister_netdev() instead of this. - */ - -void unregister_netdevice(struct net_device *dev) -{ - ASSERT_RTNL(); - - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); -} - -/** - * unregister_netdev - remove device from the kernel - * @dev: device - * - * This function shuts down a device interface and removes it - * from the kernel tables. - * - * This is just a wrapper for unregister_netdevice that takes - * the rtnl semaphore. In general you want to use this and not - * unregister_netdevice. 
- */ -void unregister_netdev(struct net_device *dev) -{ - rtnl_lock(); - unregister_netdevice(dev); - rtnl_unlock(); -} - -EXPORT_SYMBOL(unregister_netdev); - -/** - * dev_change_net_namespace - move device to different nethost namespace - * @dev: device - * @net: network namespace - * @pat: If not NULL name pattern to try if the current device name - * is already taken in the destination network namespace. - * - * This function shuts down a device interface and moves it - * to a new network namespace. On success 0 is returned, on - * a failure a netagive errno code is returned. - * - * Callers must hold the rtnl semaphore. - */ - -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) -{ - char buf[IFNAMSIZ]; - const char *destname; - int err; - - ASSERT_RTNL(); - - /* Don't allow namespace local devices to be moved. */ - err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) - goto out; - -#ifdef CONFIG_SYSFS - /* Don't allow real devices to be moved when sysfs - * is enabled. - */ - err = -EINVAL; - if (dev->dev.parent) - goto out; -#endif - - /* Ensure the device has been registrered */ - err = -EINVAL; - if (dev->reg_state != NETREG_REGISTERED) - goto out; - - /* Get out if there is nothing todo */ - err = 0; - if (net_eq(dev_net(dev), net)) - goto out; - - /* Pick the destination device name, and ensure - * we can use it in the destination network namespace. - */ - err = -EEXIST; - destname = dev->name; - if (__dev_get_by_name(net, destname)) { - /* We get here if we can't use the current device name */ - if (!pat) - goto out; - if (!dev_valid_name(pat)) - goto out; - if (strchr(pat, '%')) { - if (__dev_alloc_name(net, pat, buf) < 0) - goto out; - destname = buf; - } else - destname = pat; - if (__dev_get_by_name(net, destname)) - goto out; - } - - /* - * And now a mini version of register_netdevice unregister_netdevice. - */ - - /* If device is running close it first. */ - dev_close(dev); - - /* And unlink it from device chain */ - err = -ENODEV; - unlist_netdevice(dev); - - synchronize_net(); - - /* Shutdown queueing discipline. */ - dev_shutdown(dev); - - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - - /* - * Flush the unicast and multicast chains - */ - dev_addr_discard(dev); - - netdev_unregister_kobject(dev); - - /* Actually switch the network namespace */ - dev_net_set(dev, net); - - /* Assign the new device name */ - if (destname != dev->name) - strcpy(dev->name, destname); - - /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) { - int iflink = (dev->iflink == dev->ifindex); - dev->ifindex = dev_new_index(net); - if (iflink) - dev->iflink = dev->ifindex; - } - - /* Fixup kobjects */ - err = netdev_register_kobject(dev); - WARN_ON(err); - - /* Add the device back in the hashes */ - list_netdevice(dev); - - /* Notify protocols, that a new device appeared. 
*/ - call_netdevice_notifiers(NETDEV_REGISTER, dev); - - synchronize_net(); - err = 0; -out: - return err; -} - -static int dev_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *ocpu) -{ - struct sk_buff **list_skb; - struct Qdisc **list_net; - struct sk_buff *skb; - unsigned int cpu, oldcpu = (unsigned long)ocpu; - struct softnet_data *sd, *oldsd; - - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; - - local_irq_disable(); - cpu = smp_processor_id(); - sd = &per_cpu(softnet_data, cpu); - oldsd = &per_cpu(softnet_data, oldcpu); - - /* Find end of our completion_queue. */ - list_skb = &sd->completion_queue; - while (*list_skb) - list_skb = &(*list_skb)->next; - /* Append completion queue from offline CPU. */ - *list_skb = oldsd->completion_queue; - oldsd->completion_queue = NULL; - - /* Find end of our output_queue. */ - list_net = &sd->output_queue; - while (*list_net) - list_net = &(*list_net)->next_sched; - /* Append output queue from offline CPU. */ - *list_net = oldsd->output_queue; - oldsd->output_queue = NULL; - - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_enable(); - - /* Process offline CPU's input_pkt_queue */ - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) - netif_rx(skb); - - return NOTIFY_OK; -} - - -/** - * netdev_increment_features - increment feature set by one - * @all: current feature set - * @one: new feature set - * @mask: mask feature set - * - * Computes a new feature set after adding a device with feature set - * @one to the master device with current feature set @all. Will not - * enable anything that is off in @mask. Returns the new feature set. - */ -unsigned long netdev_increment_features(unsigned long all, unsigned long one, - unsigned long mask) -{ - /* If device needs checksumming, downgrade to it. */ - if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) - all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); - else if (mask & NETIF_F_ALL_CSUM) { - /* If one device supports v4/v6 checksumming, set for all. */ - if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && - !(all & NETIF_F_GEN_CSUM)) { - all &= ~NETIF_F_ALL_CSUM; - all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - } - - /* If one device supports hw checksumming, set for all. */ - if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { - all &= ~NETIF_F_ALL_CSUM; - all |= NETIF_F_HW_CSUM; - } - } - - one |= NETIF_F_ALL_CSUM; - - one |= all & NETIF_F_ONE_FOR_ALL; - all &= one | NETIF_F_LLTX | NETIF_F_GSO; - all |= one & mask & NETIF_F_ONE_FOR_ALL; - - return all; -} -EXPORT_SYMBOL(netdev_increment_features); - -static struct hlist_head *netdev_create_hash(void) -{ - int i; - struct hlist_head *hash; - - hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); - if (hash != NULL) - for (i = 0; i < NETDEV_HASHENTRIES; i++) - INIT_HLIST_HEAD(&hash[i]); - - return hash; -} - -/* Initialize per network namespace state */ -static int __net_init netdev_init(struct net *net) -{ - INIT_LIST_HEAD(&net->dev_base_head); - - net->dev_name_head = netdev_create_hash(); - if (net->dev_name_head == NULL) - goto err_name; - - net->dev_index_head = netdev_create_hash(); - if (net->dev_index_head == NULL) - goto err_idx; - - return 0; - -err_idx: - kfree(net->dev_name_head); -err_name: - return -ENOMEM; -} - -/** - * netdev_drivername - network driver for the device - * @dev: network device - * @buffer: buffer for resulting name - * @len: size of buffer - * - * Determine network driver for device. 
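/*
 * Sketch of how an aggregating (master) driver might use
 * netdev_increment_features() above to fold each slave's feature set into
 * its own.  The two-slave signature and the mask value are illustrative;
 * the mask simply bounds what the master is prepared to enable.
 */
static void demo_compute_features(struct net_device *master,
				  struct net_device *slave_a,
				  struct net_device *slave_b)
{
	unsigned long mask = NETIF_F_ALL_CSUM | NETIF_F_SG;
	unsigned long features = master->features;

	features = netdev_increment_features(features, slave_a->features, mask);
	features = netdev_increment_features(features, slave_b->features, mask);

	master->features = features;
}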
- */ -char *netdev_drivername(const struct net_device *dev, char *buffer, int len) -{ - const struct device_driver *driver; - const struct device *parent; - - if (len <= 0 || !buffer) - return buffer; - buffer[0] = 0; - - parent = dev->dev.parent; - - if (!parent) - return buffer; - - driver = parent->driver; - if (driver && driver->name) - strlcpy(buffer, driver->name, len); - return buffer; -} - -static void __net_exit netdev_exit(struct net *net) -{ - kfree(net->dev_name_head); - kfree(net->dev_index_head); -} - -static struct pernet_operations __net_initdata netdev_net_ops = { - .init = netdev_init, - .exit = netdev_exit, -}; - -static void __net_exit default_device_exit(struct net *net) -{ - struct net_device *dev; - /* - * Push all migratable of the network devices back to the - * initial network namespace - */ - rtnl_lock(); -restart: - for_each_netdev(net, dev) { - int err; - char fb_name[IFNAMSIZ]; - - /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) - continue; - - /* Delete virtual devices */ - if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { - dev->rtnl_link_ops->dellink(dev); - goto restart; - } - - /* Push remaing network devices to init_net */ - snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); - err = dev_change_net_namespace(dev, &init_net, fb_name); - if (err) { - printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", - __func__, dev->name, err); - BUG(); - } - goto restart; - } - rtnl_unlock(); -} - -static struct pernet_operations __net_initdata default_device_ops = { - .exit = default_device_exit, -}; - -/* - * Initialize the DEV module. At boot time this walks the device list and - * unhooks any devices that fail to initialise (normally hardware not - * present) and leaves us with a valid list of present and active devices. - * - */ - -/* - * This is called single threaded during boot, so no need - * to take the rtnl semaphore. - */ -static int __init net_dev_init(void) -{ - int i, rc = -ENOMEM; - - BUG_ON(!dev_boot_phase); - - if (dev_proc_init()) - goto out; - - if (netdev_kobject_init()) - goto out; - - INIT_LIST_HEAD(&ptype_all); - for (i = 0; i < PTYPE_HASH_SIZE; i++) - INIT_LIST_HEAD(&ptype_base[i]); - - if (register_pernet_subsys(&netdev_net_ops)) - goto out; - - /* - * Initialise the packet receive queues. - */ - - for_each_possible_cpu(i) { - struct softnet_data *queue; - - queue = &per_cpu(softnet_data, i); - skb_queue_head_init(&queue->input_pkt_queue); - queue->completion_queue = NULL; - INIT_LIST_HEAD(&queue->poll_list); - - queue->backlog.poll = process_backlog; - queue->backlog.weight = weight_p; - queue->backlog.gro_list = NULL; - } - - dev_boot_phase = 0; - - /* The loopback device is special if any other network devices - * is present in a network namespace the loopback device must - * be present. Since we now dynamically allocate and free the - * loopback device ensure this invariant is maintained by - * keeping the loopback device as the first device on the - * list of network devices. Ensuring the loopback devices - * is the first device that appears and the last network device - * that disappears. 
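/*
 * Tiny usage sketch for netdev_drivername() above; the buffer size and
 * message text are arbitrary.  The function copies the parent driver's
 * name into the buffer (or leaves it empty) and returns the buffer.
 */
static void demo_log_driver(const struct net_device *dev)
{
	char drv[64];

	printk(KERN_INFO "%s is bound to driver '%s'\n",
	       dev->name, netdev_drivername(dev, drv, sizeof(drv)));
}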
- */ -#ifndef DDE_LINUX - if (register_pernet_device(&loopback_net_ops)) - goto out; -#endif - - if (register_pernet_device(&default_device_ops)) - goto out; - - open_softirq(NET_TX_SOFTIRQ, net_tx_action); - open_softirq(NET_RX_SOFTIRQ, net_rx_action); - - hotcpu_notifier(dev_cpu_callback, 0); -#ifndef DDE_LINUX - dst_init(); -#endif - dev_mcast_init(); - rc = 0; -out: - return rc; -} - -subsys_initcall(net_dev_init); - -EXPORT_SYMBOL(__dev_get_by_index); -EXPORT_SYMBOL(__dev_get_by_name); -EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(dev_valid_name); -EXPORT_SYMBOL(dev_add_pack); -EXPORT_SYMBOL(dev_alloc_name); -EXPORT_SYMBOL(dev_close); -EXPORT_SYMBOL(dev_get_by_flags); -EXPORT_SYMBOL(dev_get_by_index); -EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_open); -EXPORT_SYMBOL(dev_queue_xmit); -EXPORT_SYMBOL(dev_remove_pack); -EXPORT_SYMBOL(dev_set_allmulti); -EXPORT_SYMBOL(dev_set_promiscuity); -EXPORT_SYMBOL(dev_change_flags); -EXPORT_SYMBOL(dev_set_mtu); -EXPORT_SYMBOL(dev_set_mac_address); -EXPORT_SYMBOL(free_netdev); -EXPORT_SYMBOL(netdev_boot_setup_check); -EXPORT_SYMBOL(netdev_set_master); -EXPORT_SYMBOL(netdev_state_change); -EXPORT_SYMBOL(netif_receive_skb); -EXPORT_SYMBOL(netif_rx); -EXPORT_SYMBOL(register_gifconf); -EXPORT_SYMBOL(register_netdevice); -EXPORT_SYMBOL(register_netdevice_notifier); -EXPORT_SYMBOL(skb_checksum_help); -EXPORT_SYMBOL(synchronize_net); -EXPORT_SYMBOL(unregister_netdevice); -EXPORT_SYMBOL(unregister_netdevice_notifier); -EXPORT_SYMBOL(net_enable_timestamp); -EXPORT_SYMBOL(net_disable_timestamp); -EXPORT_SYMBOL(dev_get_flags); - -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) -EXPORT_SYMBOL(br_handle_frame_hook); -EXPORT_SYMBOL(br_fdb_get_hook); -EXPORT_SYMBOL(br_fdb_put_hook); -#endif - -#ifdef CONFIG_KMOD -EXPORT_SYMBOL(dev_load); -#endif - -EXPORT_PER_CPU_SYMBOL(softnet_data); diff --git a/libdde_linux26/lib/src/net/core/.svn/text-base/link_watch.c.svn-base b/libdde_linux26/lib/src/net/core/.svn/text-base/link_watch.c.svn-base deleted file mode 100644 index 1afdb815..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/text-base/link_watch.c.svn-base +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Linux network device link state notification - * - * Author: - * Stefan Rompf <sux@loplof.de> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/if.h> -#include <net/sock.h> -#include <net/pkt_sched.h> -#include <linux/rtnetlink.h> -#include <linux/jiffies.h> -#include <linux/spinlock.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include <linux/bitops.h> -#include <asm/types.h> - - -enum lw_bits { - LW_URGENT = 0, -}; - -static unsigned long linkwatch_flags; -static unsigned long linkwatch_nextevent; - -static void linkwatch_event(struct work_struct *dummy); -static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); - -static struct net_device *lweventlist; -static DEFINE_SPINLOCK(lweventlist_lock); - -static unsigned char default_operstate(const struct net_device *dev) -{ -#ifndef DDE_LINUX - if (!netif_carrier_ok(dev)) - return (dev->ifindex != dev->iflink ? 
- IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); - - if (netif_dormant(dev)) - return IF_OPER_DORMANT; -#endif - - return IF_OPER_UP; -} - - -static void rfc2863_policy(struct net_device *dev) -{ -#ifndef DDE_LINUX - unsigned char operstate = default_operstate(dev); - - if (operstate == dev->operstate) - return; - - write_lock_bh(&dev_base_lock); - - switch(dev->link_mode) { - case IF_LINK_MODE_DORMANT: - if (operstate == IF_OPER_UP) - operstate = IF_OPER_DORMANT; - break; - - case IF_LINK_MODE_DEFAULT: - default: - break; - } - - dev->operstate = operstate; - - write_unlock_bh(&dev_base_lock); -#endif -} - - -static bool linkwatch_urgent_event(struct net_device *dev) -{ - return netif_running(dev) && netif_carrier_ok(dev) && - qdisc_tx_changing(dev); -} - - -static void linkwatch_add_event(struct net_device *dev) -{ - unsigned long flags; - - spin_lock_irqsave(&lweventlist_lock, flags); - dev->link_watch_next = lweventlist; - lweventlist = dev; - spin_unlock_irqrestore(&lweventlist_lock, flags); -} - - -static void linkwatch_schedule_work(int urgent) -{ - unsigned long delay = linkwatch_nextevent - jiffies; - - if (test_bit(LW_URGENT, &linkwatch_flags)) - return; - - /* Minimise down-time: drop delay for up event. */ - if (urgent) { - if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) - return; - delay = 0; - } - - /* If we wrap around we'll delay it by at most HZ. */ - if (delay > HZ) - delay = 0; - - /* - * This is true if we've scheduled it immeditately or if we don't - * need an immediate execution and it's already pending. - */ - if (schedule_delayed_work(&linkwatch_work, delay) == !delay) - return; - - /* Don't bother if there is nothing urgent. */ - if (!test_bit(LW_URGENT, &linkwatch_flags)) - return; - - /* It's already running which is good enough. */ - if (!cancel_delayed_work(&linkwatch_work)) - return; - - /* Otherwise we reschedule it again for immediate exection. */ - schedule_delayed_work(&linkwatch_work, 0); -} - - -static void __linkwatch_run_queue(int urgent_only) -{ -#ifndef DDE_LINUX - struct net_device *next; - - /* - * Limit the number of linkwatch events to one - * per second so that a runaway driver does not - * cause a storm of messages on the netlink - * socket. This limit does not apply to up events - * while the device qdisc is down. - */ - if (!urgent_only) - linkwatch_nextevent = jiffies + HZ; - /* Limit wrap-around effect on delay. */ - else if (time_after(linkwatch_nextevent, jiffies + HZ)) - linkwatch_nextevent = jiffies; - - clear_bit(LW_URGENT, &linkwatch_flags); - - spin_lock_irq(&lweventlist_lock); - next = lweventlist; - lweventlist = NULL; - spin_unlock_irq(&lweventlist_lock); - - while (next) { - struct net_device *dev = next; - - next = dev->link_watch_next; - - if (urgent_only && !linkwatch_urgent_event(dev)) { - linkwatch_add_event(dev); - continue; - } - - /* - * Make sure the above read is complete since it can be - * rewritten as soon as we clear the bit below. 
- */ - smp_mb__before_clear_bit(); - - /* We are about to handle this device, - * so new events can be accepted - */ - clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); - - rfc2863_policy(dev); - if (dev->flags & IFF_UP) { - if (netif_carrier_ok(dev)) - dev_activate(dev); - else - dev_deactivate(dev); - - netdev_state_change(dev); - } - - dev_put(dev); - } - - if (lweventlist) - linkwatch_schedule_work(0); -#endif -} - - -/* Must be called with the rtnl semaphore held */ -void linkwatch_run_queue(void) -{ - __linkwatch_run_queue(0); -} - - -static void linkwatch_event(struct work_struct *dummy) -{ -#ifndef DDE_LINUX - rtnl_lock(); - __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); - rtnl_unlock(); -#endif -} - - -void linkwatch_fire_event(struct net_device *dev) -{ -#ifndef DDE_LINUX - bool urgent = linkwatch_urgent_event(dev); - - if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { - dev_hold(dev); - - linkwatch_add_event(dev); - } else if (!urgent) - return; - - linkwatch_schedule_work(urgent); -#endif -} - -EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/libdde_linux26/lib/src/net/core/.svn/text-base/net_namespace.c.svn-base b/libdde_linux26/lib/src/net/core/.svn/text-base/net_namespace.c.svn-base deleted file mode 100644 index ab5a0a7f..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/text-base/net_namespace.c.svn-base +++ /dev/null @@ -1,511 +0,0 @@ -#include <linux/workqueue.h> -#include <linux/rtnetlink.h> -#include <linux/cache.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/idr.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> - -/* - * Our network namespace constructor/destructor lists - */ - -static LIST_HEAD(pernet_list); -static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); - -LIST_HEAD(net_namespace_list); -EXPORT_SYMBOL_GPL(net_namespace_list); - -struct net init_net; -EXPORT_SYMBOL(init_net); - -#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ - -/* - * setup_net runs the initializers for the network namespace object. - */ -static __net_init int setup_net(struct net *net) -{ - /* Must be called with net_mutex held */ - struct pernet_operations *ops; - int error = 0; - - atomic_set(&net->count, 1); - -#ifdef NETNS_REFCNT_DEBUG - atomic_set(&net->use_count, 0); -#endif - - list_for_each_entry(ops, &pernet_list, list) { - if (ops->init) { - error = ops->init(net); - if (error < 0) - goto out_undo; - } - } -out: - return error; - -out_undo: - /* Walk through the list backwards calling the exit functions - * for the pernet modules whose init functions did not fail. 
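/*
 * Sketch of how a driver normally reaches linkwatch_fire_event() above: it
 * does not call it directly but toggles the carrier with netif_carrier_on()/
 * netif_carrier_off(), which (in the stock kernel) fire the link watch event
 * on its behalf.  demo_phy_link_up() is a hypothetical stand-in for the
 * hardware's link poll.
 */
static int demo_phy_link_up(struct net_device *dev)
{
	return 1;	/* placeholder: read the PHY/link status register here */
}

static void demo_link_change(struct net_device *dev)
{
	if (demo_phy_link_up(dev)) {
		netif_carrier_on(dev);	/* schedules an (urgent) linkwatch run */
		netif_wake_queue(dev);
	} else {
		netif_carrier_off(dev);
		netif_stop_queue(dev);
	}
}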
- */ - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); - } - -#ifndef DDE_LINUX - rcu_barrier(); -#endif - goto out; -} - -static struct net_generic *net_alloc_generic(void) -{ - struct net_generic *ng; - size_t generic_size = sizeof(struct net_generic) + - INITIAL_NET_GEN_PTRS * sizeof(void *); - - ng = kzalloc(generic_size, GFP_KERNEL); - if (ng) - ng->len = INITIAL_NET_GEN_PTRS; - - return ng; -} - -#ifdef CONFIG_NET_NS -static struct kmem_cache *net_cachep; -static struct workqueue_struct *netns_wq; - -static struct net *net_alloc(void) -{ - struct net *net = NULL; - struct net_generic *ng; - - ng = net_alloc_generic(); - if (!ng) - goto out; - - net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); - if (!net) - goto out_free; - - rcu_assign_pointer(net->gen, ng); -out: - return net; - -out_free: - kfree(ng); - goto out; -} - -static void net_free(struct net *net) -{ -#ifdef NETNS_REFCNT_DEBUG - if (unlikely(atomic_read(&net->use_count) != 0)) { - printk(KERN_EMERG "network namespace not free! Usage: %d\n", - atomic_read(&net->use_count)); - return; - } -#endif - kfree(net->gen); - kmem_cache_free(net_cachep, net); -} - -struct net *copy_net_ns(unsigned long flags, struct net *old_net) -{ - struct net *new_net = NULL; - int err; - - get_net(old_net); - - if (!(flags & CLONE_NEWNET)) - return old_net; - - err = -ENOMEM; - new_net = net_alloc(); - if (!new_net) - goto out_err; - - mutex_lock(&net_mutex); - err = setup_net(new_net); - if (!err) { - rtnl_lock(); - list_add_tail(&new_net->list, &net_namespace_list); - rtnl_unlock(); - } - mutex_unlock(&net_mutex); - - if (err) - goto out_free; -out: - put_net(old_net); - return new_net; - -out_free: - net_free(new_net); -out_err: - new_net = ERR_PTR(err); - goto out; -} - -static void cleanup_net(struct work_struct *work) -{ - struct pernet_operations *ops; - struct net *net; - - net = container_of(work, struct net, work); - - mutex_lock(&net_mutex); - - /* Don't let anyone else find us. */ - rtnl_lock(); - list_del(&net->list); - rtnl_unlock(); - - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); - } - - mutex_unlock(&net_mutex); - - /* Ensure there are no outstanding rcu callbacks using this - * network namespace. 
- */ - rcu_barrier(); - - /* Finally it is safe to free my network namespace structure */ - net_free(net); -} - -void __put_net(struct net *net) -{ - /* Cleanup the network namespace in process context */ - INIT_WORK(&net->work, cleanup_net); - queue_work(netns_wq, &net->work); -} -EXPORT_SYMBOL_GPL(__put_net); - -#else -struct net *copy_net_ns(unsigned long flags, struct net *old_net) -{ - if (flags & CLONE_NEWNET) - return ERR_PTR(-EINVAL); - return old_net; -} -#endif - -static int __init net_ns_init(void) -{ - struct net_generic *ng; - int err; - - printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); -#ifdef CONFIG_NET_NS - net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), - SMP_CACHE_BYTES, - SLAB_PANIC, NULL); - - /* Create workqueue for cleanup */ - netns_wq = create_singlethread_workqueue("netns"); - if (!netns_wq) - panic("Could not create netns workq"); -#endif - - ng = net_alloc_generic(); - if (!ng) - panic("Could not allocate generic netns"); - - rcu_assign_pointer(init_net.gen, ng); - - mutex_lock(&net_mutex); - err = setup_net(&init_net); - - rtnl_lock(); - list_add_tail(&init_net.list, &net_namespace_list); - rtnl_unlock(); - - mutex_unlock(&net_mutex); - if (err) - panic("Could not setup the initial network namespace"); - - return 0; -} - -pure_initcall(net_ns_init); - -#ifdef CONFIG_NET_NS -static int register_pernet_operations(struct list_head *list, - struct pernet_operations *ops) -{ - struct net *net, *undo_net; - int error; - - list_add_tail(&ops->list, list); - if (ops->init) { - for_each_net(net) { - error = ops->init(net); - if (error) - goto out_undo; - } - } - return 0; - -out_undo: - /* If I have an error cleanup all namespaces I initialized */ - list_del(&ops->list); - if (ops->exit) { - for_each_net(undo_net) { - if (undo_net == net) - goto undone; - ops->exit(undo_net); - } - } -undone: - return error; -} - -static void unregister_pernet_operations(struct pernet_operations *ops) -{ - struct net *net; - - list_del(&ops->list); - if (ops->exit) - for_each_net(net) - ops->exit(net); -} - -#else - -static int register_pernet_operations(struct list_head *list, - struct pernet_operations *ops) -{ - if (ops->init == NULL) - return 0; - return ops->init(&init_net); -} - -static void unregister_pernet_operations(struct pernet_operations *ops) -{ - if (ops->exit) - ops->exit(&init_net); -} -#endif - -static DEFINE_IDA(net_generic_ids); - -/** - * register_pernet_subsys - register a network namespace subsystem - * @ops: pernet operations structure for the subsystem - * - * Register a subsystem which has init and exit functions - * that are called when network namespaces are created and - * destroyed respectively. - * - * When registered all network namespace init functions are - * called for every existing network namespace. Allowing kernel - * modules to have a race free view of the set of network namespaces. - * - * When a new network namespace is created all of the init - * methods are called in the order in which they were registered. - * - * When a network namespace is destroyed all of the exit methods - * are called in the reverse of the order with which they were - * registered. 
- */ -int register_pernet_subsys(struct pernet_operations *ops) -{ - int error; - mutex_lock(&net_mutex); - error = register_pernet_operations(first_device, ops); - mutex_unlock(&net_mutex); - return error; -} -EXPORT_SYMBOL_GPL(register_pernet_subsys); - -/** - * unregister_pernet_subsys - unregister a network namespace subsystem - * @ops: pernet operations structure to manipulate - * - * Remove the pernet operations structure from the list to be - * used when network namespaces are created or destroyed. In - * addition run the exit method for all existing network - * namespaces. - */ -void unregister_pernet_subsys(struct pernet_operations *module) -{ - mutex_lock(&net_mutex); - unregister_pernet_operations(module); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_subsys); - -int register_pernet_gen_subsys(int *id, struct pernet_operations *ops) -{ - int rv; - - mutex_lock(&net_mutex); -again: - rv = ida_get_new_above(&net_generic_ids, 1, id); - if (rv < 0) { - if (rv == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - rv = register_pernet_operations(first_device, ops); - if (rv < 0) - ida_remove(&net_generic_ids, *id); -out: - mutex_unlock(&net_mutex); - return rv; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_subsys); - -void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys); - -/** - * register_pernet_device - register a network namespace device - * @ops: pernet operations structure for the subsystem - * - * Register a device which has init and exit functions - * that are called when network namespaces are created and - * destroyed respectively. - * - * When registered all network namespace init functions are - * called for every existing network namespace. Allowing kernel - * modules to have a race free view of the set of network namespaces. - * - * When a new network namespace is created all of the init - * methods are called in the order in which they were registered. - * - * When a network namespace is destroyed all of the exit methods - * are called in the reverse of the order with which they were - * registered. - */ -int register_pernet_device(struct pernet_operations *ops) -{ - int error; - mutex_lock(&net_mutex); - error = register_pernet_operations(&pernet_list, ops); - if (!error && (first_device == &pernet_list)) - first_device = &ops->list; - mutex_unlock(&net_mutex); - return error; -} -EXPORT_SYMBOL_GPL(register_pernet_device); - -int register_pernet_gen_device(int *id, struct pernet_operations *ops) -{ - int error; - mutex_lock(&net_mutex); -again: - error = ida_get_new_above(&net_generic_ids, 1, id); - if (error) { - if (error == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - error = register_pernet_operations(&pernet_list, ops); - if (error) - ida_remove(&net_generic_ids, *id); - else if (first_device == &pernet_list) - first_device = &ops->list; -out: - mutex_unlock(&net_mutex); - return error; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_device); - -/** - * unregister_pernet_device - unregister a network namespace netdevice - * @ops: pernet operations structure to manipulate - * - * Remove the pernet operations structure from the list to be - * used when network namespaces are created or destroyed. 
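/*
 * Minimal sketch of the register_pernet_subsys() API documented above: the
 * init/exit hooks of a pernet_operations structure run for every existing
 * and future network namespace.  The hook bodies are placeholders.
 */
static int demo_pernet_init(struct net *net)
{
	/* allocate/initialise per-namespace state here */
	return 0;
}

static void demo_pernet_exit(struct net *net)
{
	/* release per-namespace state here */
}

static struct pernet_operations demo_pernet_ops = {
	.init = demo_pernet_init,
	.exit = demo_pernet_exit,
};

static int __init demo_subsys_init(void)
{
	return register_pernet_subsys(&demo_pernet_ops);
}

static void __exit demo_subsys_exit(void)
{
	unregister_pernet_subsys(&demo_pernet_ops);
}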
In - * addition run the exit method for all existing network - * namespaces. - */ -void unregister_pernet_device(struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - if (&ops->list == first_device) - first_device = first_device->next; - unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_device); - -void unregister_pernet_gen_device(int id, struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - if (&ops->list == first_device) - first_device = first_device->next; - unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_gen_device); - -static void net_generic_release(struct rcu_head *rcu) -{ - struct net_generic *ng; - - ng = container_of(rcu, struct net_generic, rcu); - kfree(ng); -} - -int net_assign_generic(struct net *net, int id, void *data) -{ - struct net_generic *ng, *old_ng; - - BUG_ON(!mutex_is_locked(&net_mutex)); - BUG_ON(id == 0); - - ng = old_ng = net->gen; - if (old_ng->len >= id) - goto assign; - - ng = kzalloc(sizeof(struct net_generic) + - id * sizeof(void *), GFP_KERNEL); - if (ng == NULL) - return -ENOMEM; - - /* - * Some synchronisation notes: - * - * The net_generic explores the net->gen array inside rcu - * read section. Besides once set the net->gen->ptr[x] - * pointer never changes (see rules in netns/generic.h). - * - * That said, we simply duplicate this array and schedule - * the old copy for kfree after a grace period. - */ - - ng->len = id; - memcpy(&ng->ptr, &old_ng->ptr, old_ng->len); - - rcu_assign_pointer(net->gen, ng); - call_rcu(&old_ng->rcu, net_generic_release); -assign: - ng->ptr[id - 1] = data; - return 0; -} -EXPORT_SYMBOL_GPL(net_assign_generic); diff --git a/libdde_linux26/lib/src/net/core/.svn/text-base/rtnetlink.c.svn-base b/libdde_linux26/lib/src/net/core/.svn/text-base/rtnetlink.c.svn-base deleted file mode 100644 index 8408e3da..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/text-base/rtnetlink.c.svn-base +++ /dev/null @@ -1,1436 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Routing netlink socket interface: protocol independent part. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes: - * Vitaly E. Lavrov RTA_OK arithmetics was wrong. 
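/*
 * Sketch of the generic-pointer API above as a consumer would use it:
 * register_pernet_gen_subsys() reserves an id, net_assign_generic() (legal
 * here because the init hook runs with net_mutex held) publishes the
 * per-namespace data, and the net_generic() helper from <net/netns/generic.h>
 * looks it up again.  struct demo_pernet and demo_net_id are illustrative.
 */
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int demo_net_id;

struct demo_pernet {
	int counter;
};

static int demo_gen_init(struct net *net)
{
	struct demo_pernet *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	return net_assign_generic(net, demo_net_id, p);
}

static void demo_gen_exit(struct net *net)
{
	kfree(net_generic(net, demo_net_id));
}

static struct pernet_operations demo_gen_ops = {
	.init = demo_gen_init,
	.exit = demo_gen_exit,
};

static int __init demo_gen_register(void)
{
	return register_pernet_gen_subsys(&demo_net_id, &demo_gen_ops);
}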
- */ - -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/string.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/fcntl.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/interrupt.h> -#include <linux/capability.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/security.h> -#include <linux/mutex.h> -#include <linux/if_addr.h> -#include <linux/nsproxy.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/string.h> - -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/arp.h> -#include <net/route.h> -#include <net/udp.h> -#include <net/sock.h> -#include <net/pkt_sched.h> -#include <net/fib_rules.h> -#include <net/rtnetlink.h> - -struct rtnl_link -{ - rtnl_doit_func doit; - rtnl_dumpit_func dumpit; -}; - -static DEFINE_MUTEX(rtnl_mutex); - -void rtnl_lock(void) -{ - mutex_lock(&rtnl_mutex); -} - -void __rtnl_unlock(void) -{ - mutex_unlock(&rtnl_mutex); -} - -void rtnl_unlock(void) -{ - /* This fellow will unlock it for us. */ - netdev_run_todo(); -} - -int rtnl_trylock(void) -{ - return mutex_trylock(&rtnl_mutex); -} - -int rtnl_is_locked(void) -{ - return mutex_is_locked(&rtnl_mutex); -} - -static struct rtnl_link *rtnl_msg_handlers[NPROTO]; - -static inline int rtm_msgindex(int msgtype) -{ - int msgindex = msgtype - RTM_BASE; - - /* - * msgindex < 0 implies someone tried to register a netlink - * control code. msgindex >= RTM_NR_MSGTYPES may indicate that - * the message type has not been added to linux/rtnetlink.h - */ - BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); - - return msgindex; -} - -static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) -{ - struct rtnl_link *tab; - - tab = rtnl_msg_handlers[protocol]; - if (tab == NULL || tab[msgindex].doit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; - - return tab ? tab[msgindex].doit : NULL; -} - -static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) -{ - struct rtnl_link *tab; - - tab = rtnl_msg_handlers[protocol]; - if (tab == NULL || tab[msgindex].dumpit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; - - return tab ? tab[msgindex].dumpit : NULL; -} - -/** - * __rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - * - * Returns 0 on success or a negative error code. 
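/*
 * Sketch of the locking convention above: code that walks the device list
 * takes the rtnl mutex with rtnl_lock()/rtnl_unlock() (or probes it with
 * rtnl_trylock()).  The loop body is illustrative.
 */
static void demo_dump_devices(struct net *net)
{
	struct net_device *dev;

	rtnl_lock();
	for_each_netdev(net, dev)
		printk(KERN_DEBUG "%s: ifindex %d\n", dev->name, dev->ifindex);
	rtnl_unlock();	/* note: also runs the deferred netdev todo list */
}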
- */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit) -{ - struct rtnl_link *tab; - int msgindex; - - BUG_ON(protocol < 0 || protocol >= NPROTO); - msgindex = rtm_msgindex(msgtype); - - tab = rtnl_msg_handlers[protocol]; - if (tab == NULL) { - tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); - if (tab == NULL) - return -ENOBUFS; - - rtnl_msg_handlers[protocol] = tab; - } - - if (doit) - tab[msgindex].doit = doit; - - if (dumpit) - tab[msgindex].dumpit = dumpit; - - return 0; -} - -EXPORT_SYMBOL_GPL(__rtnl_register); - -/** - * rtnl_register - Register a rtnetlink message type - * - * Identical to __rtnl_register() but panics on failure. This is useful - * as failure of this function is very unlikely, it can only happen due - * to lack of memory when allocating the chain to store all message - * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continueing. - */ -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit) -{ - if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0) - panic("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", - protocol, msgtype); -} - -EXPORT_SYMBOL_GPL(rtnl_register); - -/** - * rtnl_unregister - Unregister a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * - * Returns 0 on success or a negative error code. - */ -int rtnl_unregister(int protocol, int msgtype) -{ - int msgindex; - - BUG_ON(protocol < 0 || protocol >= NPROTO); - msgindex = rtm_msgindex(msgtype); - - if (rtnl_msg_handlers[protocol] == NULL) - return -ENOENT; - - rtnl_msg_handlers[protocol][msgindex].doit = NULL; - rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; - - return 0; -} - -EXPORT_SYMBOL_GPL(rtnl_unregister); - -/** - * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol - * @protocol : Protocol family or PF_UNSPEC - * - * Identical to calling rtnl_unregster() for all registered message types - * of a certain protocol family. - */ -void rtnl_unregister_all(int protocol) -{ - BUG_ON(protocol < 0 || protocol >= NPROTO); - - kfree(rtnl_msg_handlers[protocol]); - rtnl_msg_handlers[protocol] = NULL; -} - -EXPORT_SYMBOL_GPL(rtnl_unregister_all); - -static LIST_HEAD(link_ops); - -/** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_link_register(struct rtnl_link_ops *ops) -{ - if (!ops->dellink) - ops->dellink = unregister_netdevice; - - list_add_tail(&ops->list, &link_ops); - return 0; -} - -EXPORT_SYMBOL_GPL(__rtnl_link_register); - -/** - * rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * Returns 0 on success or a negative error code. 
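/*
 * Sketch of registering handlers with rtnl_register() as documented above.
 * The handler bodies are placeholders using the doit/dumpit signatures from
 * this file; the (PF_UNSPEC, RTM_GETLINK) pair is only a familiar example of
 * a protocol/message-type slot.
 */
static int demo_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
	return 0;		/* handle one request message */
}

static int demo_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
	return skb->len;	/* fill skb; returning its length continues the dump */
}

static void __init demo_rtnl_init(void)
{
	/* rtnl_register() panics on failure; __rtnl_register() returns an errno */
	rtnl_register(PF_UNSPEC, RTM_GETLINK, demo_doit, demo_dumpit);
}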
- */ -int rtnl_link_register(struct rtnl_link_ops *ops) -{ - int err; - - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); - return err; -} - -EXPORT_SYMBOL_GPL(rtnl_link_register); - -static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -{ - struct net_device *dev; -restart: - for_each_netdev(net, dev) { - if (dev->rtnl_link_ops == ops) { - ops->dellink(dev); - goto restart; - } - } -} - -void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -{ - rtnl_lock(); - __rtnl_kill_links(net, ops); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(rtnl_kill_links); - -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - struct net *net; - - for_each_net(net) { - __rtnl_kill_links(net, ops); - } - list_del(&ops->list); -} - -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); - -/** - * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - */ -void rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - rtnl_lock(); - __rtnl_link_unregister(ops); - rtnl_unlock(); -} - -EXPORT_SYMBOL_GPL(rtnl_link_unregister); - -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) -{ - const struct rtnl_link_ops *ops; - - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; - } - return NULL; -} - -static size_t rtnl_link_get_size(const struct net_device *dev) -{ - const struct rtnl_link_ops *ops = dev->rtnl_link_ops; - size_t size; - - if (!ops) - return 0; - - size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ - nlmsg_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ - - if (ops->get_size) - /* IFLA_INFO_DATA + nested data */ - size += nlmsg_total_size(sizeof(struct nlattr)) + - ops->get_size(dev); - - if (ops->get_xstats_size) - size += ops->get_xstats_size(dev); /* IFLA_INFO_XSTATS */ - - return size; -} - -static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) -{ - const struct rtnl_link_ops *ops = dev->rtnl_link_ops; - struct nlattr *linkinfo, *data; - int err = -EMSGSIZE; - - linkinfo = nla_nest_start(skb, IFLA_LINKINFO); - if (linkinfo == NULL) - goto out; - - if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) - goto err_cancel_link; - if (ops->fill_xstats) { - err = ops->fill_xstats(skb, dev); - if (err < 0) - goto err_cancel_link; - } - if (ops->fill_info) { - data = nla_nest_start(skb, IFLA_INFO_DATA); - if (data == NULL) - goto err_cancel_link; - err = ops->fill_info(skb, dev); - if (err < 0) - goto err_cancel_data; - nla_nest_end(skb, data); - } - - nla_nest_end(skb, linkinfo); - return 0; - -err_cancel_data: - nla_nest_cancel(skb, data); -err_cancel_link: - nla_nest_cancel(skb, linkinfo); -out: - return err; -} - -static const int rtm_min[RTM_NR_FAMILIES] = -{ - [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), - [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), - [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), - [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)), - [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), - [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct 
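/*
 * Sketch of a minimal rtnl_link_ops registration, restricted to the fields
 * this file consults (kind, priv_size, setup); dellink is left NULL so that
 * __rtnl_link_register() fills in unregister_netdevice, as shown above.
 * The "demo" kind string and demo_link_setup() are illustrative.
 */
static void demo_link_setup(struct net_device *dev)
{
	ether_setup(dev);
}

static struct rtnl_link_ops demo_link_ops = {
	.kind		= "demo",
	.priv_size	= 0,
	.setup		= demo_link_setup,
};

static int __init demo_link_init(void)
{
	return rtnl_link_register(&demo_link_ops);	/* takes rtnl itself */
}

static void __exit demo_link_exit(void)
{
	rtnl_link_unregister(&demo_link_ops);		/* also deletes live devices of this kind */
}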
rtgenmsg)), - [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), -}; - -static const int rta_max[RTM_NR_FAMILIES] = -{ - [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, - [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, - [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX, - [RTM_FAM(RTM_NEWRULE)] = FRA_MAX, - [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX, - [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, - [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, - [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, -}; - -#ifndef DDE_LINUX -void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) -{ - struct rtattr *rta; - int size = RTA_LENGTH(attrlen); - - rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); - rta->rta_type = attrtype; - rta->rta_len = size; - memcpy(RTA_DATA(rta), data, attrlen); - memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); -} - -int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) -{ - struct sock *rtnl = net->rtnl; - int err = 0; - - NETLINK_CB(skb).dst_group = group; - if (echo) - atomic_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); - if (echo) - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - return err; -} - -int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) -{ - struct sock *rtnl = net->rtnl; - - return nlmsg_unicast(rtnl, skb, pid); -} - -int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) -{ - struct sock *rtnl = net->rtnl; - int report = 0; - - if (nlh) - report = nlmsg_report(nlh); - - return nlmsg_notify(rtnl, skb, pid, group, report, flags); -} - -void rtnl_set_sk_err(struct net *net, u32 group, int error) -{ - struct sock *rtnl = net->rtnl; - - netlink_set_err(rtnl, 0, group, error); -} - -int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) -{ - struct nlattr *mx; - int i, valid = 0; - - mx = nla_nest_start(skb, RTA_METRICS); - if (mx == NULL) - return -ENOBUFS; - - for (i = 0; i < RTAX_MAX; i++) { - if (metrics[i]) { - valid++; - NLA_PUT_U32(skb, i+1, metrics[i]); - } - } - - if (!valid) { - nla_nest_cancel(skb, mx); - return 0; - } - - return nla_nest_end(skb, mx); - -nla_put_failure: - nla_nest_cancel(skb, mx); - return -EMSGSIZE; -} - -int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - u32 ts, u32 tsage, long expires, u32 error) -{ - struct rta_cacheinfo ci = { - .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), - .rta_used = dst->__use, - .rta_clntref = atomic_read(&(dst->__refcnt)), - .rta_error = error, - .rta_id = id, - .rta_ts = ts, - .rta_tsage = tsage, - }; - - if (expires) - ci.rta_expires = jiffies_to_clock_t(expires); - - return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); -} - -EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); - -static void set_operstate(struct net_device *dev, unsigned char transition) -{ - unsigned char operstate = dev->operstate; - - switch(transition) { - case IF_OPER_UP: - if ((operstate == IF_OPER_DORMANT || - operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) - operstate = IF_OPER_UP; - break; - - case IF_OPER_DORMANT: - if (operstate == IF_OPER_UP || - operstate == IF_OPER_UNKNOWN) - operstate = IF_OPER_DORMANT; - break; - } - - if (dev->operstate != operstate) { - write_lock_bh(&dev_base_lock); - dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); - netdev_state_change(dev); - } -} - -static void copy_rtnl_link_stats(struct rtnl_link_stats *a, - const struct net_device_stats *b) -{ - a->rx_packets = b->rx_packets; - a->tx_packets = b->tx_packets; - a->rx_bytes = 
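/*
 * Sketch of the nested-attribute pattern used by rtnetlink_put_metrics()
 * above: open a nest, emit attributes with the NLA_PUT_* macros (which jump
 * to nla_put_failure when the skb runs out of tailroom), then close or
 * cancel the nest.  The DEMO_* attribute numbers are illustrative.
 */
#define DEMO_NEST	1
#define DEMO_VALUE	1

static int demo_put_nested(struct sk_buff *skb, u32 value)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, DEMO_NEST);
	if (nest == NULL)
		return -ENOBUFS;

	NLA_PUT_U32(skb, DEMO_VALUE, value);

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}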
b->rx_bytes; - a->tx_bytes = b->tx_bytes; - a->rx_errors = b->rx_errors; - a->tx_errors = b->tx_errors; - a->rx_dropped = b->rx_dropped; - a->tx_dropped = b->tx_dropped; - - a->multicast = b->multicast; - a->collisions = b->collisions; - - a->rx_length_errors = b->rx_length_errors; - a->rx_over_errors = b->rx_over_errors; - a->rx_crc_errors = b->rx_crc_errors; - a->rx_frame_errors = b->rx_frame_errors; - a->rx_fifo_errors = b->rx_fifo_errors; - a->rx_missed_errors = b->rx_missed_errors; - - a->tx_aborted_errors = b->tx_aborted_errors; - a->tx_carrier_errors = b->tx_carrier_errors; - a->tx_fifo_errors = b->tx_fifo_errors; - a->tx_heartbeat_errors = b->tx_heartbeat_errors; - a->tx_window_errors = b->tx_window_errors; - - a->rx_compressed = b->rx_compressed; - a->tx_compressed = b->tx_compressed; -}; - -static inline size_t if_nlmsg_size(const struct net_device *dev) -{ - return NLMSG_ALIGN(sizeof(struct ifinfomsg)) - + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ - + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ - + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ - + nla_total_size(sizeof(struct rtnl_link_ifmap)) - + nla_total_size(sizeof(struct rtnl_link_stats)) - + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ - + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ - + nla_total_size(4) /* IFLA_TXQLEN */ - + nla_total_size(4) /* IFLA_WEIGHT */ - + nla_total_size(4) /* IFLA_MTU */ - + nla_total_size(4) /* IFLA_LINK */ - + nla_total_size(4) /* IFLA_MASTER */ - + nla_total_size(1) /* IFLA_OPERSTATE */ - + nla_total_size(1) /* IFLA_LINKMODE */ - + rtnl_link_get_size(dev); /* IFLA_LINKINFO */ -} - -static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, - int type, u32 pid, u32 seq, u32 change, - unsigned int flags) -{ - struct netdev_queue *txq; - struct ifinfomsg *ifm; - struct nlmsghdr *nlh; - const struct net_device_stats *stats; - struct nlattr *attr; - - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); - if (nlh == NULL) - return -EMSGSIZE; - - ifm = nlmsg_data(nlh); - ifm->ifi_family = AF_UNSPEC; - ifm->__ifi_pad = 0; - ifm->ifi_type = dev->type; - ifm->ifi_index = dev->ifindex; - ifm->ifi_flags = dev_get_flags(dev); - ifm->ifi_change = change; - - NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); - NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len); - NLA_PUT_U8(skb, IFLA_OPERSTATE, - netif_running(dev) ? 
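/*
 * Sketch of the message-building skeleton that rtnl_fill_ifinfo() above
 * follows: nlmsg_new() sizes the skb, nlmsg_put() writes the header,
 * attributes go in between, and nlmsg_end() (or nlmsg_cancel()) finishes
 * the message.  RTM_NEWLINK and the default size are used only as familiar
 * values.
 */
static struct sk_buff *demo_build_msg(u32 pid, u32 seq)
{
	struct sk_buff *skb;
	struct nlmsghdr *nlh;
	struct ifinfomsg *ifm;

	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (skb == NULL)
		return NULL;

	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), 0);
	if (nlh == NULL) {
		kfree_skb(skb);
		return NULL;
	}

	ifm = nlmsg_data(nlh);
	ifm->ifi_family = AF_UNSPEC;
	ifm->__ifi_pad = 0;
	/* remaining header fields and attributes would be filled in here */

	nlmsg_end(skb, nlh);
	return skb;
}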
dev->operstate : IF_OPER_DOWN); - NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); - NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); - - if (dev->ifindex != dev->iflink) - NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); - - if (dev->master) - NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); - - txq = netdev_get_tx_queue(dev, 0); - if (txq->qdisc_sleeping) - NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id); - - if (dev->ifalias) - NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias); - - if (1) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - NLA_PUT(skb, IFLA_MAP, sizeof(map), &map); - } - - if (dev->addr_len) { - NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); - NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); - } - - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); - if (attr == NULL) - goto nla_put_failure; - - stats = dev_get_stats(dev); - copy_rtnl_link_stats(nla_data(attr), stats); - - if (dev->rtnl_link_ops) { - if (rtnl_link_fill(skb, dev) < 0) - goto nla_put_failure; - } - - return nlmsg_end(skb, nlh); - -nla_put_failure: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - int idx; - int s_idx = cb->args[0]; - struct net_device *dev; - - idx = 0; - for_each_netdev(net, dev) { - if (idx < s_idx) - goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0) - break; -cont: - idx++; - } - cb->args[0] = idx; - - return skb->len; -} - -const struct nla_policy ifla_policy[IFLA_MAX+1] = { - [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, - [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, - [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, - [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, - [IFLA_MTU] = { .type = NLA_U32 }, - [IFLA_LINK] = { .type = NLA_U32 }, - [IFLA_TXQLEN] = { .type = NLA_U32 }, - [IFLA_WEIGHT] = { .type = NLA_U32 }, - [IFLA_OPERSTATE] = { .type = NLA_U8 }, - [IFLA_LINKMODE] = { .type = NLA_U8 }, - [IFLA_LINKINFO] = { .type = NLA_NESTED }, - [IFLA_NET_NS_PID] = { .type = NLA_U32 }, - [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, -}; - -static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { - [IFLA_INFO_KIND] = { .type = NLA_STRING }, - [IFLA_INFO_DATA] = { .type = NLA_NESTED }, -}; - -static struct net *get_net_ns_by_pid(pid_t pid) -{ - struct task_struct *tsk; - struct net *net; - - /* Lookup the network namespace */ - net = ERR_PTR(-ESRCH); - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) { - struct nsproxy *nsproxy; - nsproxy = task_nsproxy(tsk); - if (nsproxy) - net = get_net(nsproxy->net_ns); - } - rcu_read_unlock(); - return net; -} - -static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) -{ - if (dev) { - if (tb[IFLA_ADDRESS] && - nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) - return -EINVAL; - - if (tb[IFLA_BROADCAST] && - nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) - return -EINVAL; - } - - return 0; -} - -static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, - struct nlattr **tb, char *ifname, int modified) -{ - const struct net_device_ops *ops = dev->netdev_ops; - int send_addr_notify = 0; - int err; - - if (tb[IFLA_NET_NS_PID]) { - struct net *net; - net = 
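/*
 * Sketch of the attribute-parsing pattern above: declare an nla_policy
 * table, let nlmsg_parse() validate and index the attributes, then test the
 * tb[] slots directly, as do_setlink() does.  The DEMO_ATTR_* names are
 * illustrative.
 */
enum {
	DEMO_ATTR_UNSPEC,
	DEMO_ATTR_NAME,
	DEMO_ATTR_MTU,
	__DEMO_ATTR_MAX
};
#define DEMO_ATTR_MAX (__DEMO_ATTR_MAX - 1)

static const struct nla_policy demo_policy[DEMO_ATTR_MAX + 1] = {
	[DEMO_ATTR_NAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
	[DEMO_ATTR_MTU]  = { .type = NLA_U32 },
};

static int demo_parse(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
	struct nlattr *tb[DEMO_ATTR_MAX + 1];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, DEMO_ATTR_MAX,
			  demo_policy);
	if (err < 0)
		return err;

	if (tb[DEMO_ATTR_MTU])
		printk(KERN_DEBUG "requested mtu %u\n",
		       nla_get_u32(tb[DEMO_ATTR_MTU]));
	return 0;
}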
get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto errout; - } - err = dev_change_net_namespace(dev, net, ifname); - put_net(net); - if (err) - goto errout; - modified = 1; - } - - if (tb[IFLA_MAP]) { - struct rtnl_link_ifmap *u_map; - struct ifmap k_map; - - if (!ops->ndo_set_config) { - err = -EOPNOTSUPP; - goto errout; - } - - if (!netif_device_present(dev)) { - err = -ENODEV; - goto errout; - } - - u_map = nla_data(tb[IFLA_MAP]); - k_map.mem_start = (unsigned long) u_map->mem_start; - k_map.mem_end = (unsigned long) u_map->mem_end; - k_map.base_addr = (unsigned short) u_map->base_addr; - k_map.irq = (unsigned char) u_map->irq; - k_map.dma = (unsigned char) u_map->dma; - k_map.port = (unsigned char) u_map->port; - - err = ops->ndo_set_config(dev, &k_map); - if (err < 0) - goto errout; - - modified = 1; - } - - if (tb[IFLA_ADDRESS]) { - struct sockaddr *sa; - int len; - - if (!ops->ndo_set_mac_address) { - err = -EOPNOTSUPP; - goto errout; - } - - if (!netif_device_present(dev)) { - err = -ENODEV; - goto errout; - } - - len = sizeof(sa_family_t) + dev->addr_len; - sa = kmalloc(len, GFP_KERNEL); - if (!sa) { - err = -ENOMEM; - goto errout; - } - sa->sa_family = dev->type; - memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), - dev->addr_len); - err = ops->ndo_set_mac_address(dev, sa); - kfree(sa); - if (err) - goto errout; - send_addr_notify = 1; - modified = 1; - } - - if (tb[IFLA_MTU]) { - err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); - if (err < 0) - goto errout; - modified = 1; - } - - /* - * Interface selected by interface index but interface - * name provided implies that a name change has been - * requested. - */ - if (ifm->ifi_index > 0 && ifname[0]) { - err = dev_change_name(dev, ifname); - if (err < 0) - goto errout; - modified = 1; - } - - if (tb[IFLA_IFALIAS]) { - err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), - nla_len(tb[IFLA_IFALIAS])); - if (err < 0) - goto errout; - modified = 1; - } - - if (tb[IFLA_BROADCAST]) { - nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); - send_addr_notify = 1; - } - - if (ifm->ifi_flags || ifm->ifi_change) { - unsigned int flags = ifm->ifi_flags; - - /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ - if (ifm->ifi_change) - flags = (flags & ifm->ifi_change) | - (dev->flags & ~ifm->ifi_change); - err = dev_change_flags(dev, flags); - if (err < 0) - goto errout; - } - - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); - - if (tb[IFLA_OPERSTATE]) - set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); - - if (tb[IFLA_LINKMODE]) { - write_lock_bh(&dev_base_lock); - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); - write_unlock_bh(&dev_base_lock); - } - - err = 0; - -errout: - if (err < 0 && modified && net_ratelimit()) - printk(KERN_WARNING "A link change request failed with " - "some changes comitted already. 
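/*
 * Sketch of driving the same helpers do_setlink() above relies on from
 * other kernel code that already holds a device pointer: dev_set_mtu() and
 * dev_change_flags() are called with the rtnl semaphore held.  The MTU
 * value is arbitrary.
 */
static int demo_configure(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 1500);
	if (!err)
		err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();

	return err;
}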
Interface %s may " - "have been left with an inconsistent configuration, " - "please check.\n", dev->name); - - if (send_addr_notify) - call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - return err; -} - -static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) -{ - struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct net_device *dev; - int err; - struct nlattr *tb[IFLA_MAX+1]; - char ifname[IFNAMSIZ]; - - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); - if (err < 0) - goto errout; - - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - else - ifname[0] = '\0'; - - err = -EINVAL; - ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) - dev = dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = dev_get_by_name(net, ifname); - else - goto errout; - - if (dev == NULL) { - err = -ENODEV; - goto errout; - } - - if ((err = validate_linkmsg(dev, tb)) < 0) - goto errout_dev; - - err = do_setlink(dev, ifm, tb, ifname, 0); -errout_dev: - dev_put(dev); -errout: - return err; -} - -static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) -{ - struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; - struct net_device *dev; - struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - int err; - - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); - if (err < 0) - return err; - - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - - ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); - else - return -EINVAL; - - if (!dev) - return -ENODEV; - - ops = dev->rtnl_link_ops; - if (!ops) - return -EOPNOTSUPP; - - ops->dellink(dev); - return 0; -} - -struct net_device *rtnl_create_link(struct net *net, char *ifname, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) -{ - int err; - struct net_device *dev; - - err = -ENOMEM; - dev = alloc_netdev(ops->priv_size, ifname, ops->setup); - if (!dev) - goto err; - - if (strchr(dev->name, '%')) { - err = dev_alloc_name(dev, dev->name); - if (err < 0) - goto err_free; - } - - dev_net_set(dev, net); - dev->rtnl_link_ops = ops; - - if (tb[IFLA_MTU]) - dev->mtu = nla_get_u32(tb[IFLA_MTU]); - if (tb[IFLA_ADDRESS]) - memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), - nla_len(tb[IFLA_ADDRESS])); - if (tb[IFLA_BROADCAST]) - memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), - nla_len(tb[IFLA_BROADCAST])); - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); - if (tb[IFLA_OPERSTATE]) - set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); - if (tb[IFLA_LINKMODE]) - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); - - return dev; - -err_free: - free_netdev(dev); -err: - return ERR_PTR(err); -} - -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) -{ - struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; - struct net_device *dev; - struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; - char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - int err; - -#ifdef CONFIG_MODULES -replay: -#endif - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); - if (err < 0) - return err; - - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - else - ifname[0] = '\0'; - - ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) - dev = 
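/*
 * Sketch of the reference-counting discipline rtnl_setlink() above follows:
 * dev_get_by_name()/dev_get_by_index() take a reference that must be dropped
 * with dev_put(), while the __dev_get_by_*() variants take none but require
 * rtnl (or dev_base_lock) protection instead.
 */
static int demo_poke(struct net *net, const char *name)
{
	struct net_device *dev = dev_get_by_name(net, name);

	if (dev == NULL)
		return -ENODEV;

	printk(KERN_DEBUG "%s: flags %#x\n", dev->name, dev_get_flags(dev));
	dev_put(dev);
	return 0;
}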
__dev_get_by_index(net, ifm->ifi_index); - else if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; - - if ((err = validate_linkmsg(dev, tb)) < 0) - return err; - - if (tb[IFLA_LINKINFO]) { - err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, - tb[IFLA_LINKINFO], ifla_info_policy); - if (err < 0) - return err; - } else - memset(linkinfo, 0, sizeof(linkinfo)); - - if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); - } else { - kind[0] = '\0'; - ops = NULL; - } - - if (1) { - struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; - - if (ops) { - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data); - if (err < 0) - return err; - } - } - - if (dev) { - int modified = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; - - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; - - err = ops->changelink(dev, tb, data); - if (err < 0) - return err; - modified = 1; - } - - return do_setlink(dev, ifm, tb, ifname, modified); - } - - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) - return -ENODEV; - - if (ifm->ifi_index || ifm->ifi_flags || ifm->ifi_change) - return -EOPNOTSUPP; - if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; - - if (!ops) { -#ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - return -EOPNOTSUPP; - } - - if (!ifname[0]) - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - - dev = rtnl_create_link(net, ifname, ops, tb); - - if (IS_ERR(dev)) - err = PTR_ERR(dev); - else if (ops->newlink) - err = ops->newlink(dev, tb, data); - else - err = register_netdevice(dev); - - if (err < 0 && !IS_ERR(dev)) - free_netdev(dev); - return err; - } -} - -static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) -{ - struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct nlattr *tb[IFLA_MAX+1]; - struct net_device *dev = NULL; - struct sk_buff *nskb; - int err; - - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); - if (err < 0) - return err; - - ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) { - dev = dev_get_by_index(net, ifm->ifi_index); - if (dev == NULL) - return -ENODEV; - } else - return -EINVAL; - - nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); - if (nskb == NULL) { - err = -ENOBUFS; - goto errout; - } - - err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, - nlh->nlmsg_seq, 0, 0); - if (err < 0) { - /* -EMSGSIZE implies BUG in if_nlmsg_size */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(nskb); - goto errout; - } - err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); -errout: - dev_put(dev); - - return err; -} - -static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) -{ - int idx; - int s_idx = cb->family; - - if (s_idx == 0) - s_idx = 1; - for (idx=1; idx<NPROTO; idx++) { - int type = cb->nlh->nlmsg_type-RTM_BASE; - if (idx < s_idx || idx == PF_PACKET) - continue; - if (rtnl_msg_handlers[idx] == NULL || - rtnl_msg_handlers[idx][type].dumpit == NULL) - continue; - if (idx > s_idx) - memset(&cb->args[0], 0, sizeof(cb->args)); - if 
(rtnl_msg_handlers[idx][type].dumpit(skb, cb)) - break; - } - cb->family = idx; - - return skb->len; -} -#endif /* DDE_LINUX */ - -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) -{ - struct net *net = dev_net(dev); -#ifndef DDE_LINUX - struct sk_buff *skb; - int err = -ENOBUFS; - - skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); - if (skb == NULL) - goto errout; - - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); - if (err < 0) { - /* -EMSGSIZE implies BUG in if_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; - } - err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); -errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_LINK, err); -#endif /* DDE_LINUX */ -} - -#ifndef DDE_LINUX -/* Protected by RTNL sempahore. */ -static struct rtattr **rta_buf; -static int rtattr_max; - -/* Process one rtnetlink message. */ - -static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - struct net *net = sock_net(skb->sk); - rtnl_doit_func doit; - int sz_idx, kind; - int min_len; - int family; - int type; - int err; - - type = nlh->nlmsg_type; - if (type > RTM_MAX) - return -EOPNOTSUPP; - - type -= RTM_BASE; - - /* All the messages must have at least 1 byte length */ - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) - return 0; - - family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family >= NPROTO) - return -EAFNOSUPPORT; - - sz_idx = type>>2; - kind = type&3; - - if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) - return -EPERM; - - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { - struct sock *rtnl; - rtnl_dumpit_func dumpit; - - dumpit = rtnl_get_dumpit(family, type); - if (dumpit == NULL) - return -EOPNOTSUPP; - - __rtnl_unlock(); - rtnl = net->rtnl; - err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); - rtnl_lock(); - return err; - } - - memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); - - min_len = rtm_min[sz_idx]; - if (nlh->nlmsg_len < min_len) - return -EINVAL; - - if (nlh->nlmsg_len > min_len) { - int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); - - while (RTA_OK(attr, attrlen)) { - unsigned flavor = attr->rta_type; - if (flavor) { - if (flavor > rta_max[sz_idx]) - return -EINVAL; - rta_buf[flavor-1] = attr; - } - attr = RTA_NEXT(attr, attrlen); - } - } - - doit = rtnl_get_doit(family, type); - if (doit == NULL) - return -EOPNOTSUPP; - - return doit(skb, nlh, (void *)&rta_buf[0]); -} - -static void rtnetlink_rcv(struct sk_buff *skb) -{ - rtnl_lock(); - netlink_rcv_skb(skb, &rtnetlink_rcv_msg); - rtnl_unlock(); -} - -static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct net_device *dev = ptr; - - switch (event) { - case NETDEV_UNREGISTER: - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); - break; - case NETDEV_REGISTER: - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); - break; - case NETDEV_UP: - case NETDEV_DOWN: - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); - break; - case NETDEV_CHANGE: - case NETDEV_GOING_DOWN: - break; - default: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block rtnetlink_dev_notifier = { - .notifier_call = rtnetlink_event, -}; - - -static int rtnetlink_net_init(struct net *net) -{ - struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); - if (!sk) - return -ENOMEM; - net->rtnl = sk; - return 0; -} - -static void 
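/*
 * Sketch of the notifier pattern rtnetlink_event() above sits on, seen from
 * a consumer: a notifier_block registered with register_netdevice_notifier()
 * receives the NETDEV_* events the core emits around register/up/down/
 * unregister.  The printed messages are illustrative.
 */
static int demo_netdev_event(struct notifier_block *nb, unsigned long event,
			     void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		printk(KERN_DEBUG "%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		printk(KERN_DEBUG "%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_netdev_notifier = {
	.notifier_call = demo_netdev_event,
};

static int __init demo_notifier_init(void)
{
	return register_netdevice_notifier(&demo_netdev_notifier);
}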
rtnetlink_net_exit(struct net *net) -{ - netlink_kernel_release(net->rtnl); - net->rtnl = NULL; -} - -static struct pernet_operations rtnetlink_net_ops = { - .init = rtnetlink_net_init, - .exit = rtnetlink_net_exit, -}; - -void __init rtnetlink_init(void) -{ - int i; - - rtattr_max = 0; - for (i = 0; i < ARRAY_SIZE(rta_max); i++) - if (rta_max[i] > rtattr_max) - rtattr_max = rta_max[i]; - rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); - if (!rta_buf) - panic("rtnetlink_init: cannot allocate rta_buf\n"); - - if (register_pernet_subsys(&rtnetlink_net_ops)) - panic("rtnetlink_init: cannot initialize rtnetlink\n"); - - netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); - register_netdevice_notifier(&rtnetlink_dev_notifier); - - rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL); - rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL); - - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); -} - -EXPORT_SYMBOL(__rta_fill); -EXPORT_SYMBOL(rtnetlink_put_metrics); -EXPORT_SYMBOL(rtnl_lock); -EXPORT_SYMBOL(rtnl_trylock); -EXPORT_SYMBOL(rtnl_unlock); -EXPORT_SYMBOL(rtnl_is_locked); -EXPORT_SYMBOL(rtnl_unicast); -EXPORT_SYMBOL(rtnl_notify); -EXPORT_SYMBOL(rtnl_set_sk_err); -EXPORT_SYMBOL(rtnl_create_link); -EXPORT_SYMBOL(ifla_policy); -#endif /* !DDE_LINUX */ diff --git a/libdde_linux26/lib/src/net/core/.svn/text-base/skbuff.c.svn-base b/libdde_linux26/lib/src/net/core/.svn/text-base/skbuff.c.svn-base deleted file mode 100644 index 59b275b0..00000000 --- a/libdde_linux26/lib/src/net/core/.svn/text-base/skbuff.c.svn-base +++ /dev/null @@ -1,2938 +0,0 @@ -/* - * Routines having to do with the 'struct sk_buff' memory handlers. - * - * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> - * Florian La Roche <rzsfl@rz.uni-sb.de> - * - * Fixes: - * Alan Cox : Fixed the worst of the load - * balancer bugs. - * Dave Platt : Interrupt stacking fix. - * Richard Kooijman : Timestamp fixes. - * Alan Cox : Changed buffer format. - * Alan Cox : destructor hook for AF_UNIX etc. - * Linus Torvalds : Better skb_clone. - * Alan Cox : Added skb_copy. - * Alan Cox : Added all the changed routines Linus - * only put in the headers - * Ray VanTassle : Fixed --skb->lock in free - * Alan Cox : skb_copy copy arp field - * Andi Kleen : slabified it. - * Robert Olsson : Removed skb_head_pool - * - * NOTE: - * The __skb_ routines should be called with interrupts - * disabled, or you better be *real* sure that the operation is atomic - * with respect to whatever list is being frobbed (e.g. via lock_sock() - * or via disabling bottom half handlers, etc). - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -/* - * The functions in this file will not compile correctly with gcc 2.4.x - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/slab.h> -#include <linux/netdevice.h> -#ifdef CONFIG_NET_CLS_ACT -#include <net/pkt_sched.h> -#endif -#include <linux/string.h> -#include <linux/skbuff.h> -#include <linux/splice.h> -#include <linux/cache.h> -#include <linux/rtnetlink.h> -#include <linux/init.h> -#include <linux/scatterlist.h> - -#include <net/protocol.h> -#include <net/dst.h> -#include <net/sock.h> -#include <net/checksum.h> -#ifndef DDE_LINUX -#include <net/xfrm.h> -#endif /* DDE_LINUX */ - -#include "local.h" - -#include <asm/uaccess.h> -#include <asm/system.h> - -#include "kmap_skb.h" - -static struct kmem_cache *skbuff_head_cache __read_mostly; -static struct kmem_cache *skbuff_fclone_cache __read_mostly; - -static void sock_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - put_page(buf->page); -} - -static void sock_pipe_buf_get(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - get_page(buf->page); -} - -static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - - -/* Pipe buffer operations for a socket. */ -static struct pipe_buf_operations sock_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = sock_pipe_buf_release, - .steal = sock_pipe_buf_steal, - .get = sock_pipe_buf_get, -}; - -/* - * Keep out-of-line to prevent kernel bloat. - * __builtin_return_address is not used because it is not always - * reliable. - */ - -/** - * skb_over_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_put(). Not user callable. - */ -void skb_over_panic(struct sk_buff *skb, int sz, void *here) -{ - printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " - "data:%p tail:%#lx end:%#lx dev:%s\n", - here, skb->len, sz, skb->head, skb->data, - (unsigned long)skb->tail, (unsigned long)skb->end, - skb->dev ? skb->dev->name : "<NULL>"); - BUG(); -} - -/** - * skb_under_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_push(). Not user callable. - */ - -void skb_under_panic(struct sk_buff *skb, int sz, void *here) -{ - printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " - "data:%p tail:%#lx end:%#lx dev:%s\n", - here, skb->len, sz, skb->head, skb->data, - (unsigned long)skb->tail, (unsigned long)skb->end, - skb->dev ? skb->dev->name : "<NULL>"); - BUG(); -} - -/* Allocate a new skbuff. We do this ourselves so we can fill in a few - * 'private' fields and also do memory statistics to find all the - * [BEEP] leaks. - * - */ - -/** - * __alloc_skb - allocate a network buffer - * @size: size to allocate - * @gfp_mask: allocation mask - * @fclone: allocate from fclone cache instead of head cache - * and allocate a cloned (child) skb - * @node: numa node to allocate memory on - * - * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. - * - * Buffers may only be allocated from interrupts using a @gfp_mask of - * %GFP_ATOMIC. 
- */ -struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, - int fclone, int node) -{ - struct kmem_cache *cache; - struct skb_shared_info *shinfo; - struct sk_buff *skb; - u8 *data; - - cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; - - /* Get the HEAD */ - skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); - if (!skb) - goto out; - - size = SKB_DATA_ALIGN(size); - data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), - gfp_mask, node); - if (!data) - goto nodata; - - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! - */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->truesize = size + sizeof(struct sk_buff); - atomic_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - atomic_set(&shinfo->dataref, 1); - shinfo->nr_frags = 0; - shinfo->gso_size = 0; - shinfo->gso_segs = 0; - shinfo->gso_type = 0; - shinfo->ip6_frag_id = 0; - shinfo->frag_list = NULL; - - if (fclone) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); - - skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); - - child->fclone = SKB_FCLONE_UNAVAILABLE; - } -out: - return skb; -nodata: - kmem_cache_free(cache, skb); - skb = NULL; - goto out; -} - -/** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. - */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) -{ - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; - struct sk_buff *skb; - - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = dev; - } - return skb; -} - -struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) -{ - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; - struct page *page; - - page = alloc_pages_node(node, gfp_mask, 0); - return page; -} -EXPORT_SYMBOL(__netdev_alloc_page); - -void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size) -{ - skb_fill_page_desc(skb, i, page, off, size); - skb->len += size; - skb->data_len += size; - skb->truesize += size; -} -EXPORT_SYMBOL(skb_add_rx_frag); - -/** - * dev_alloc_skb - allocate an skbuff for receiving - * @length: length to allocate - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. Although this function - * allocates memory it can be called from an interrupt. 
- */ -struct sk_buff *dev_alloc_skb(unsigned int length) -{ - /* - * There is more code here than it seems: - * __dev_alloc_skb is an inline - */ - return __dev_alloc_skb(length, GFP_ATOMIC); -} -EXPORT_SYMBOL(dev_alloc_skb); - -static void skb_drop_list(struct sk_buff **listp) -{ - struct sk_buff *list = *listp; - - *listp = NULL; - - do { - struct sk_buff *this = list; - list = list->next; - kfree_skb(this); - } while (list); -} - -static inline void skb_drop_fraglist(struct sk_buff *skb) -{ - skb_drop_list(&skb_shinfo(skb)->frag_list); -} - -static void skb_clone_fraglist(struct sk_buff *skb) -{ - struct sk_buff *list; - - for (list = skb_shinfo(skb)->frag_list; list; list = list->next) - skb_get(list); -} - -static void skb_release_data(struct sk_buff *skb) -{ - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - put_page(skb_shinfo(skb)->frags[i].page); - } - - if (skb_shinfo(skb)->frag_list) - skb_drop_fraglist(skb); - - kfree(skb->head); - } -} - -/* - * Free an skbuff by memory without cleaning the state. - */ -static void kfree_skbmem(struct sk_buff *skb) -{ - struct sk_buff *other; - atomic_t *fclone_ref; - - switch (skb->fclone) { - case SKB_FCLONE_UNAVAILABLE: - kmem_cache_free(skbuff_head_cache, skb); - break; - - case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); - break; - - case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; - - /* The clone portion is available for - * fast-cloning again. - */ - skb->fclone = SKB_FCLONE_UNAVAILABLE; - - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); - break; - } -} - -static void skb_release_head_state(struct sk_buff *skb) -{ -#ifndef DDE_LINUX - dst_release(skb->dst); -#endif -#ifdef CONFIG_XFRM - secpath_put(skb->sp); -#endif - if (skb->destructor) { - WARN_ON(in_irq()); - skb->destructor(skb); - } -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - nf_conntrack_put(skb->nfct); - nf_conntrack_put_reasm(skb->nfct_reasm); -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(skb->nf_bridge); -#endif -/* XXX: IS this still necessary? - JHS */ -#ifdef CONFIG_NET_SCHED - skb->tc_index = 0; -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; -#endif -#endif -} - -/* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb) -{ - skb_release_head_state(skb); - skb_release_data(skb); -} - -/** - * __kfree_skb - private function - * @skb: buffer - * - * Free an sk_buff. Release anything attached to the buffer. - * Clean the state. This is an internal helper function. Users should - * always call kfree_skb - */ - -void __kfree_skb(struct sk_buff *skb) -{ - skb_release_all(skb); - kfree_skbmem(skb); -} - -/** - * kfree_skb - free an sk_buff - * @skb: buffer to free - * - * Drop a reference to the buffer and free it if the usage count has - * hit zero. 
- */ -void kfree_skb(struct sk_buff *skb) -{ - if (unlikely(!skb)) - return; - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) - return; - __kfree_skb(skb); -} - -/** - * skb_recycle_check - check if skb can be reused for receive - * @skb: buffer - * @skb_size: minimum receive buffer size - * - * Checks that the skb passed in is not shared or cloned, and - * that it is linear and its head portion at least as large as - * skb_size so that it can be recycled as a receive buffer. - * If these conditions are met, this function does any necessary - * reference count dropping and cleans up the skbuff as if it - * just came from __alloc_skb(). - */ -int skb_recycle_check(struct sk_buff *skb, int skb_size) -{ - struct skb_shared_info *shinfo; - - if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) - return 0; - - skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); - if (skb_end_pointer(skb) - skb->head < skb_size) - return 0; - - if (skb_shared(skb) || skb_cloned(skb)) - return 0; - - skb_release_head_state(skb); - shinfo = skb_shinfo(skb); - atomic_set(&shinfo->dataref, 1); - shinfo->nr_frags = 0; - shinfo->gso_size = 0; - shinfo->gso_segs = 0; - shinfo->gso_type = 0; - shinfo->ip6_frag_id = 0; - shinfo->frag_list = NULL; - - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->data = skb->head + NET_SKB_PAD; - skb_reset_tail_pointer(skb); - - return 1; -} -EXPORT_SYMBOL(skb_recycle_check); - -static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) -{ - new->tstamp = old->tstamp; - new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->dst = dst_clone(old->dst); -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); -#endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum_start = old->csum_start; - new->csum_offset = old->csum_offset; - new->local_df = old->local_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) - new->ipvs_property = old->ipvs_property; -#endif - new->protocol = old->protocol; - new->mark = old->mark; - __nf_copy(new, old); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) - new->nf_trace = old->nf_trace; -#endif -#ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; -#ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; -#endif -#endif - new->vlan_tci = old->vlan_tci; - - skb_copy_secmark(new, old); -} - -static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) -{ -#define C(x) n->x = skb->x - - n->next = n->prev = NULL; - n->sk = NULL; - __copy_skb_header(n, skb); - - C(len); - C(data_len); - C(mac_len); - n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; - n->cloned = 1; - n->nohdr = 0; - n->destructor = NULL; - C(iif); - C(tail); - C(end); - C(head); - C(data); - C(truesize); -#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) - C(do_not_encrypt); - C(requeue); -#endif - atomic_set(&n->users, 1); - - atomic_inc(&(skb_shinfo(skb)->dataref)); - skb->cloned = 1; - - return n; -#undef C -} - -/** - * skb_morph - morph one skb into another - * @dst: the skb to receive the contents - * @src: the skb to supply the contents - * - * This is identical to skb_clone except that the target skb is - * supplied by the user. 
- * - * The target skb is returned upon exit. - */ -struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) -{ - skb_release_all(dst); - return __skb_clone(dst, src); -} -EXPORT_SYMBOL_GPL(skb_morph); - -/** - * skb_clone - duplicate an sk_buff - * @skb: buffer to clone - * @gfp_mask: allocation priority - * - * Duplicate an &sk_buff. The new one is not owned by a socket. Both - * copies share the same packet data but not structure. The new - * buffer has a reference count of 1. If the allocation fails the - * function returns %NULL otherwise the new buffer is returned. - * - * If this function is called from an interrupt gfp_mask() must be - * %GFP_ATOMIC. - */ - -struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) -{ - struct sk_buff *n; - - n = skb + 1; - if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); - n->fclone = SKB_FCLONE_CLONE; - atomic_inc(fclone_ref); - } else { - n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); - if (!n) - return NULL; - n->fclone = SKB_FCLONE_UNAVAILABLE; - } - - return __skb_clone(n, skb); -} - -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) -{ -#ifndef NET_SKBUFF_DATA_USES_OFFSET - /* - * Shift between the two data areas in bytes - */ - unsigned long offset = new->data - old->data; -#endif - - __copy_skb_header(new, old); - -#ifndef NET_SKBUFF_DATA_USES_OFFSET - /* {transport,network,mac}_header are relative to skb->head */ - new->transport_header += offset; - new->network_header += offset; - new->mac_header += offset; -#endif - skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; - skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; - skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; -} - -/** - * skb_copy - create private copy of an sk_buff - * @skb: buffer to copy - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and its data. This is used when the - * caller wishes to modify the data and needs a private copy of the - * data to alter. Returns %NULL on failure or the pointer to the buffer - * on success. The returned buffer has a reference count of 1. - * - * As by-product this function converts non-linear &sk_buff to linear - * one, so that &sk_buff becomes completely private and caller is allowed - * to modify all the data of returned buffer. This means that this - * function is not recommended for use in circumstances when only - * header is going to be modified. Use pskb_copy() instead. - */ - -struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) -{ - int headerlen = skb->data - skb->head; - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end + skb->data_len, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); -#endif - if (!n) - return NULL; - - /* Set the data pointer */ - skb_reserve(n, headerlen); - /* Set the tail pointer and length */ - skb_put(n, skb->len); - - if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) - BUG(); - - copy_skb_header(n, skb); - return n; -} - - -/** - * pskb_copy - create copy of an sk_buff with private head. - * @skb: buffer to copy - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and part of its data, located - * in header. Fragmented data remain shared. This is used when - * the caller wishes to modify only header of &sk_buff and needs - * private copy of the header to alter. 
Returns %NULL on failure - * or the pointer to the buffer on success. - * The returned buffer has a reference count of 1. - */ - -struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) -{ - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head, gfp_mask); -#endif - if (!n) - goto out; - - /* Set the data pointer */ - skb_reserve(n, skb->data - skb->head); - /* Set the tail pointer and length */ - skb_put(n, skb_headlen(skb)); - /* Copy the bytes */ - skb_copy_from_linear_data(skb, n->data, n->len); - - n->truesize += skb->data_len; - n->data_len = skb->data_len; - n->len = skb->len; - - if (skb_shinfo(skb)->nr_frags) { - int i; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; - get_page(skb_shinfo(n)->frags[i].page); - } - skb_shinfo(n)->nr_frags = i; - } - - if (skb_shinfo(skb)->frag_list) { - skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; - skb_clone_fraglist(n); - } - - copy_skb_header(n, skb); -out: - return n; -} - -/** - * pskb_expand_head - reallocate header of &sk_buff - * @skb: buffer to reallocate - * @nhead: room to add at head - * @ntail: room to add at tail - * @gfp_mask: allocation priority - * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have - * reference count of 1. Returns zero in the case of success or error, - * if expansion failed. In the last case, &sk_buff is not changed. - * - * All the pointers pointing into skb header may change and must be - * reloaded after call to this function. - */ - -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, - gfp_t gfp_mask) -{ - int i; - u8 *data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - int size = nhead + skb->end + ntail; -#else - int size = nhead + (skb->end - skb->head) + ntail; -#endif - long off; - - BUG_ON(nhead < 0); - - if (skb_shared(skb)) - BUG(); - - size = SKB_DATA_ALIGN(size); - - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - goto nodata; - - /* Copy only real data... and, alas, header. This should be - * optimized for the cases when header is void. 
*/ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - memcpy(data + nhead, skb->head, skb->tail); -#else - memcpy(data + nhead, skb->head, skb->tail - skb->head); -#endif - memcpy(data + size, skb_end_pointer(skb), - sizeof(struct skb_shared_info)); - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); - - if (skb_shinfo(skb)->frag_list) - skb_clone_fraglist(skb); - - skb_release_data(skb); - - off = (data + nhead) - skb->head; - - skb->head = data; - skb->data += off; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; - off = nhead; -#else - skb->end = skb->head + size; -#endif - /* {transport,network,mac}_header and tail are relative to skb->head */ - skb->tail += off; - skb->transport_header += off; - skb->network_header += off; - skb->mac_header += off; - skb->csum_start += nhead; - skb->cloned = 0; - skb->hdr_len = 0; - skb->nohdr = 0; - atomic_set(&skb_shinfo(skb)->dataref, 1); - return 0; - -nodata: - return -ENOMEM; -} - -/* Make private copy of skb with writable head and some headroom */ - -struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) -{ - struct sk_buff *skb2; - int delta = headroom - skb_headroom(skb); - - if (delta <= 0) - skb2 = pskb_copy(skb, GFP_ATOMIC); - else { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, - GFP_ATOMIC)) { - kfree_skb(skb2); - skb2 = NULL; - } - } - return skb2; -} - - -/** - * skb_copy_expand - copy and expand sk_buff - * @skb: buffer to copy - * @newheadroom: new free bytes at head - * @newtailroom: new free bytes at tail - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and its data and while doing so - * allocate additional space. - * - * This is used when the caller wishes to modify the data and needs a - * private copy of the data to alter as well as more space for new fields. - * Returns %NULL on failure or the pointer to the buffer - * on success. The returned buffer has a reference count of 1. - * - * You must pass %GFP_ATOMIC as the allocation priority if this function - * is called from an interrupt. - */ -struct sk_buff *skb_copy_expand(const struct sk_buff *skb, - int newheadroom, int newtailroom, - gfp_t gfp_mask) -{ - /* - * Allocate the copy buffer - */ - struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask); - int oldheadroom = skb_headroom(skb); - int head_copy_len, head_copy_off; - int off; - - if (!n) - return NULL; - - skb_reserve(n, newheadroom); - - /* Set the tail pointer and length */ - skb_put(n, skb->len); - - head_copy_len = oldheadroom; - head_copy_off = 0; - if (newheadroom <= head_copy_len) - head_copy_len = newheadroom; - else - head_copy_off = newheadroom - head_copy_len; - - /* Copy the linear header and data. */ - if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, - skb->len + head_copy_len)) - BUG(); - - copy_skb_header(n, skb); - - off = newheadroom - oldheadroom; - n->csum_start += off; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n->transport_header += off; - n->network_header += off; - n->mac_header += off; -#endif - - return n; -} - -/** - * skb_pad - zero pad the tail of an skb - * @skb: buffer to pad - * @pad: space to pad - * - * Ensure that a buffer is followed by a padding area that is zero - * filled. Used by network drivers which may DMA or transfer data - * beyond the buffer end onto the wire. - * - * May return error in out of memory cases. The skb is freed on error. 
- */ - -int skb_pad(struct sk_buff *skb, int pad) -{ - int err; - int ntail; - - /* If the skbuff is non linear tailroom is always zero.. */ - if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { - memset(skb->data+skb->len, 0, pad); - return 0; - } - - ntail = skb->data_len + pad - (skb->end - skb->tail); - if (likely(skb_cloned(skb) || ntail > 0)) { - err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); - if (unlikely(err)) - goto free_skb; - } - - /* FIXME: The use of this function with non-linear skb's really needs - * to be audited. - */ - err = skb_linearize(skb); - if (unlikely(err)) - goto free_skb; - - memset(skb->data + skb->len, 0, pad); - return 0; - -free_skb: - kfree_skb(skb); - return err; -} - -/** - * skb_put - add data to a buffer - * @skb: buffer to use - * @len: amount of data to add - * - * This function extends the used data area of the buffer. If this would - * exceed the total buffer size the kernel will panic. A pointer to the - * first byte of the extra data is returned. - */ -unsigned char *skb_put(struct sk_buff *skb, unsigned int len) -{ - unsigned char *tmp = skb_tail_pointer(skb); - SKB_LINEAR_ASSERT(skb); - skb->tail += len; - skb->len += len; - if (unlikely(skb->tail > skb->end)) - skb_over_panic(skb, len, __builtin_return_address(0)); - return tmp; -} -EXPORT_SYMBOL(skb_put); - -/** - * skb_push - add data to the start of a buffer - * @skb: buffer to use - * @len: amount of data to add - * - * This function extends the used data area of the buffer at the buffer - * start. If this would exceed the total buffer headroom the kernel will - * panic. A pointer to the first byte of the extra data is returned. - */ -unsigned char *skb_push(struct sk_buff *skb, unsigned int len) -{ - skb->data -= len; - skb->len += len; - if (unlikely(skb->data<skb->head)) - skb_under_panic(skb, len, __builtin_return_address(0)); - return skb->data; -} -EXPORT_SYMBOL(skb_push); - -/** - * skb_pull - remove data from the start of a buffer - * @skb: buffer to use - * @len: amount of data to remove - * - * This function removes data from the start of a buffer, returning - * the memory to the headroom. A pointer to the next data in the buffer - * is returned. Once the data has been pulled future pushes will overwrite - * the old data. - */ -unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) -{ - return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); -} -EXPORT_SYMBOL(skb_pull); - -/** - * skb_trim - remove end from a buffer - * @skb: buffer to alter - * @len: new length - * - * Cut the length of a buffer down by removing data from the tail. If - * the buffer is already under the length specified it is not modified. - * The skb must be linear. - */ -void skb_trim(struct sk_buff *skb, unsigned int len) -{ - if (skb->len > len) - __skb_trim(skb, len); -} -EXPORT_SYMBOL(skb_trim); - -/* Trims skb to length len. It can change skb pointers. 
- */ - -int ___pskb_trim(struct sk_buff *skb, unsigned int len) -{ - struct sk_buff **fragp; - struct sk_buff *frag; - int offset = skb_headlen(skb); - int nfrags = skb_shinfo(skb)->nr_frags; - int i; - int err; - - if (skb_cloned(skb) && - unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) - return err; - - i = 0; - if (offset >= len) - goto drop_pages; - - for (; i < nfrags; i++) { - int end = offset + skb_shinfo(skb)->frags[i].size; - - if (end < len) { - offset = end; - continue; - } - - skb_shinfo(skb)->frags[i++].size = len - offset; - -drop_pages: - skb_shinfo(skb)->nr_frags = i; - - for (; i < nfrags; i++) - put_page(skb_shinfo(skb)->frags[i].page); - - if (skb_shinfo(skb)->frag_list) - skb_drop_fraglist(skb); - goto done; - } - - for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); - fragp = &frag->next) { - int end = offset + frag->len; - - if (skb_shared(frag)) { - struct sk_buff *nfrag; - - nfrag = skb_clone(frag, GFP_ATOMIC); - if (unlikely(!nfrag)) - return -ENOMEM; - - nfrag->next = frag->next; - kfree_skb(frag); - frag = nfrag; - *fragp = frag; - } - - if (end < len) { - offset = end; - continue; - } - - if (end > len && - unlikely((err = pskb_trim(frag, len - offset)))) - return err; - - if (frag->next) - skb_drop_list(&frag->next); - break; - } - -done: - if (len > skb_headlen(skb)) { - skb->data_len -= skb->len - len; - skb->len = len; - } else { - skb->len = len; - skb->data_len = 0; - skb_set_tail_pointer(skb, len); - } - - return 0; -} - -/** - * __pskb_pull_tail - advance tail of skb header - * @skb: buffer to reallocate - * @delta: number of bytes to advance tail - * - * The function makes a sense only on a fragmented &sk_buff, - * it expands header moving its tail forward and copying necessary - * data from fragmented part. - * - * &sk_buff MUST have reference count of 1. - * - * Returns %NULL (and &sk_buff does not change) if pull failed - * or value of new tail of skb in the case of success. - * - * All the pointers pointing into skb header may change and must be - * reloaded after call to this function. - */ - -/* Moves tail of skb head forward, copying data from fragmented part, - * when it is necessary. - * 1. It may fail due to malloc failure. - * 2. It may change skb pointers. - * - * It is pretty complicated. Luckily, it is called only in exceptional cases. - */ -unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) -{ - /* If skb has not enough free space at tail, get new one - * plus 128 bytes for future expansions. If we have enough - * room at tail, reallocate without expansion only if skb is cloned. - */ - int i, k, eat = (skb->tail + delta) - skb->end; - - if (eat > 0 || skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, - GFP_ATOMIC)) - return NULL; - } - - if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) - BUG(); - - /* Optimization: no fragments, no reasons to preestimate - * size of pulled pages. Superb. - */ - if (!skb_shinfo(skb)->frag_list) - goto pull_pages; - - /* Estimate size of pulled pages. */ - eat = delta; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size >= eat) - goto pull_pages; - eat -= skb_shinfo(skb)->frags[i].size; - } - - /* If we need update frag list, we are in troubles. - * Certainly, it possible to add an offset to skb data, - * but taking into account that pulling is expected to - * be very rare operation, it is worth to fight against - * further bloating skb head and crucify ourselves here instead. 
- * Pure masohism, indeed. 8)8) - */ - if (eat) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - struct sk_buff *clone = NULL; - struct sk_buff *insp = NULL; - - do { - BUG_ON(!list); - - if (list->len <= eat) { - /* Eaten as whole. */ - eat -= list->len; - list = list->next; - insp = list; - } else { - /* Eaten partially. */ - - if (skb_shared(list)) { - /* Sucks! We need to fork list. :-( */ - clone = skb_clone(list, GFP_ATOMIC); - if (!clone) - return NULL; - insp = list->next; - list = clone; - } else { - /* This may be pulled without - * problems. */ - insp = list; - } - if (!pskb_pull(list, eat)) { - if (clone) - kfree_skb(clone); - return NULL; - } - break; - } - } while (eat); - - /* Free pulled out fragments. */ - while ((list = skb_shinfo(skb)->frag_list) != insp) { - skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); - } - /* And insert new clone at head. */ - if (clone) { - clone->next = list; - skb_shinfo(skb)->frag_list = clone; - } - } - /* Success! Now we may commit changes to skb data. */ - -pull_pages: - eat = delta; - k = 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size <= eat) { - put_page(skb_shinfo(skb)->frags[i].page); - eat -= skb_shinfo(skb)->frags[i].size; - } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; - if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_shinfo(skb)->frags[k].size -= eat; - eat = 0; - } - k++; - } - } - skb_shinfo(skb)->nr_frags = k; - - skb->tail += delta; - skb->data_len -= delta; - - return skb_tail_pointer(skb); -} - -/* Copy some data bits from skb to kernel buffer. */ - -int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) -{ - int i, copy; - int start = skb_headlen(skb); - - if (offset > (int)skb->len - len) - goto fault; - - /* Copy header. */ - if ((copy = start - offset) > 0) { - if (copy > len) - copy = len; - skb_copy_from_linear_data_offset(skb, offset, to, copy); - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - WARN_ON(start > offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - u8 *vaddr; - - if (copy > len) - copy = len; - - vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); - memcpy(to, - vaddr + skb_shinfo(skb)->frags[i].page_offset+ - offset - start, copy); - kunmap_skb_frag(vaddr); - - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_bits(list, offset - start, - to, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - start = end; - } - } - if (!len) - return 0; - -fault: - return -EFAULT; -} - -/* - * Callback from splice_to_pipe(), if we need to release some pages - * at the end of the spd in case we error'ed out in filling the pipe. 
- */ -static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) -{ - put_page(spd->pages[i]); -} - -static inline struct page *linear_to_page(struct page *page, unsigned int len, - unsigned int offset) -{ - struct page *p = alloc_pages(GFP_KERNEL, 0); - - if (!p) - return NULL; - memcpy(page_address(p) + offset, page_address(page) + offset, len); - - return p; -} - -/* - * Fill page/offset/length into spd, if it can hold more pages. - */ -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, - unsigned int len, unsigned int offset, - struct sk_buff *skb, int linear) -{ - if (unlikely(spd->nr_pages == PIPE_BUFFERS)) - return 1; - - if (linear) { - page = linear_to_page(page, len, offset); - if (!page) - return 1; - } else - get_page(page); - - spd->pages[spd->nr_pages] = page; - spd->partial[spd->nr_pages].len = len; - spd->partial[spd->nr_pages].offset = offset; - spd->nr_pages++; - - return 0; -} - -static inline void __segment_seek(struct page **page, unsigned int *poff, - unsigned int *plen, unsigned int off) -{ - *poff += off; - *page += *poff / PAGE_SIZE; - *poff = *poff % PAGE_SIZE; - *plen -= off; -} - -static inline int __splice_segment(struct page *page, unsigned int poff, - unsigned int plen, unsigned int *off, - unsigned int *len, struct sk_buff *skb, - struct splice_pipe_desc *spd, int linear) -{ - if (!*len) - return 1; - - /* skip this segment if already processed */ - if (*off >= plen) { - *off -= plen; - return 0; - } - - /* ignore any bits we already processed */ - if (*off) { - __segment_seek(&page, &poff, &plen, *off); - *off = 0; - } - - do { - unsigned int flen = min(*len, plen); - - /* the linear region may spread across several pages */ - flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - - if (spd_fill_page(spd, page, flen, poff, skb, linear)) - return 1; - - __segment_seek(&page, &poff, &plen, flen); - *len -= flen; - - } while (*len && plen); - - return 0; -} - -/* - * Map linear and fragment data from the skb to spd. It reports failure if the - * pipe is full or if we already spliced the requested length. - */ -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, - unsigned int *len, - struct splice_pipe_desc *spd) -{ - int seg; - - /* - * map the linear part - */ - if (__splice_segment(virt_to_page(skb->data), - (unsigned long) skb->data & (PAGE_SIZE - 1), - skb_headlen(skb), - offset, len, skb, spd, 1)) - return 1; - - /* - * then map the fragments - */ - for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { - const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; - - if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd, 0)) - return 1; - } - - return 0; -} - -/* - * Map data from the skb to a pipe. Should handle both the linear part, - * the fragments, and the frag list. It does NOT handle frag lists within - * the frag list, if such a thing exists. We'd probably need to recurse to - * handle that cleanly. - */ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, - struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags) -{ - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .flags = flags, - .ops = &sock_pipe_buf_ops, - .spd_release = sock_spd_release, - }; - - /* - * __skb_splice_bits() only fails if the output has no room left, - * so no point in going over the frag_list for the error case. 
- */ - if (__skb_splice_bits(skb, &offset, &tlen, &spd)) - goto done; - else if (!tlen) - goto done; - - /* - * now see if we have a frag_list to map - */ - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list && tlen; list = list->next) { - if (__skb_splice_bits(list, &offset, &tlen, &spd)) - break; - } - } - -done: - if (spd.nr_pages) { - struct sock *sk = skb->sk; - int ret; - - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); - return ret; - } - - return 0; -} - -/** - * skb_store_bits - store bits from kernel buffer to skb - * @skb: destination buffer - * @offset: offset in destination - * @from: source buffer - * @len: number of bytes to copy - * - * Copy the specified number of bytes from the source buffer to the - * destination skb. This function handles all the messy bits of - * traversing fragment lists and such. - */ - -int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) -{ - int i, copy; - int start = skb_headlen(skb); - - if (offset > (int)skb->len - len) - goto fault; - - if ((copy = start - offset) > 0) { - if (copy > len) - copy = len; - skb_copy_to_linear_data_offset(skb, offset, from, copy); - if ((len -= copy) == 0) - return 0; - offset += copy; - from += copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - int end; - - WARN_ON(start > offset + len); - - end = start + frag->size; - if ((copy = end - offset) > 0) { - u8 *vaddr; - - if (copy > len) - copy = len; - - vaddr = kmap_skb_frag(frag); - memcpy(vaddr + frag->page_offset + offset - start, - from, copy); - kunmap_skb_frag(vaddr); - - if ((len -= copy) == 0) - return 0; - offset += copy; - from += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_store_bits(list, offset - start, - from, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - from += copy; - } - start = end; - } - } - if (!len) - return 0; - -fault: - return -EFAULT; -} - -EXPORT_SYMBOL(skb_store_bits); - -/* Checksum skb data. */ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - int pos = 0; - - /* Checksum header. 
*/ - if (copy > 0) { - if (copy > len) - copy = len; - csum = csum_partial(skb->data + offset, copy, csum); - if ((len -= copy) == 0) - return csum; - offset += copy; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - WARN_ON(start > offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - __wsum csum2; - u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - vaddr = kmap_skb_frag(frag); - csum2 = csum_partial(vaddr + frag->page_offset + - offset - start, copy, 0); - kunmap_skb_frag(vaddr); - csum = csum_block_add(csum, csum2, pos); - if (!(len -= copy)) - return csum; - offset += copy; - pos += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - __wsum csum2; - if (copy > len) - copy = len; - csum2 = skb_checksum(list, offset - start, - copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - pos += copy; - } - start = end; - } - } - BUG_ON(len); - - return csum; -} - -/* Both of above in one bottle. */ - -__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, - u8 *to, int len, __wsum csum) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - int pos = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - csum = csum_partial_copy_nocheck(skb->data + offset, to, - copy, csum); - if ((len -= copy) == 0) - return csum; - offset += copy; - to += copy; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - WARN_ON(start > offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - __wsum csum2; - u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - vaddr = kmap_skb_frag(frag); - csum2 = csum_partial_copy_nocheck(vaddr + - frag->page_offset + - offset - start, to, - copy, 0); - kunmap_skb_frag(vaddr); - csum = csum_block_add(csum, csum2, pos); - if (!(len -= copy)) - return csum; - offset += copy; - to += copy; - pos += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - __wsum csum2; - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - csum2 = skb_copy_and_csum_bits(list, - offset - start, - to, copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - to += copy; - pos += copy; - } - start = end; - } - } - BUG_ON(len); - return csum; -} - -void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) -{ - __wsum csum; - long csstart; - - if (skb->ip_summed == CHECKSUM_PARTIAL) - csstart = skb->csum_start - skb_headroom(skb); - else - csstart = skb_headlen(skb); - - BUG_ON(csstart > skb_headlen(skb)); - - skb_copy_from_linear_data(skb, to, csstart); - - csum = 0; - if (csstart != skb->len) - csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, - skb->len - csstart, 0); - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - long csstuff = csstart + skb->csum_offset; - - *((__sum16 *)(to + csstuff)) = csum_fold(csum); - } -} - -/** - * skb_dequeue - remove from the head of the queue - * 
@list: list to dequeue from - * - * Remove the head of the list. The list lock is taken so the function - * may be used safely with other locking list functions. The head item is - * returned or %NULL if the list is empty. - */ - -struct sk_buff *skb_dequeue(struct sk_buff_head *list) -{ - unsigned long flags; - struct sk_buff *result; - - spin_lock_irqsave(&list->lock, flags); - result = __skb_dequeue(list); - spin_unlock_irqrestore(&list->lock, flags); - return result; -} - -/** - * skb_dequeue_tail - remove from the tail of the queue - * @list: list to dequeue from - * - * Remove the tail of the list. The list lock is taken so the function - * may be used safely with other locking list functions. The tail item is - * returned or %NULL if the list is empty. - */ -struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) -{ - unsigned long flags; - struct sk_buff *result; - - spin_lock_irqsave(&list->lock, flags); - result = __skb_dequeue_tail(list); - spin_unlock_irqrestore(&list->lock, flags); - return result; -} - -/** - * skb_queue_purge - empty a list - * @list: list to empty - * - * Delete all buffers on an &sk_buff list. Each buffer is removed from - * the list and one reference dropped. This function takes the list - * lock and is atomic with respect to other list locking functions. - */ -void skb_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); -} - -/** - * skb_queue_head - queue a buffer at the list head - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the start of the list. This function takes the - * list lock and can be used safely with other locking &sk_buff functions - * safely. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_queue_head(list, newsk); - spin_unlock_irqrestore(&list->lock, flags); -} - -/** - * skb_queue_tail - queue a buffer at the list tail - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the tail of the list. This function takes the - * list lock and can be used safely with other locking &sk_buff functions - * safely. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_queue_tail(list, newsk); - spin_unlock_irqrestore(&list->lock, flags); -} - -/** - * skb_unlink - remove a buffer from a list - * @skb: buffer to remove - * @list: list to use - * - * Remove a packet from a list. The list locks are taken and this - * function is atomic with respect to other list locked calls - * - * You must know what list the SKB is on. - */ -void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_unlink(skb, list); - spin_unlock_irqrestore(&list->lock, flags); -} - -/** - * skb_append - append a buffer - * @old: buffer to insert after - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet after a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls. - * A buffer cannot be placed on two lists at the same time. 
- */ -void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_queue_after(list, old, newsk); - spin_unlock_irqrestore(&list->lock, flags); -} - - -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} - -static inline void skb_split_inside_header(struct sk_buff *skb, - struct sk_buff* skb1, - const u32 len, const int pos) -{ - int i; - - skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), - pos - len); - /* And move data appendix as is. */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; - - skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; - skb_shinfo(skb)->nr_frags = 0; - skb1->data_len = skb->data_len; - skb1->len += skb1->data_len; - skb->data_len = 0; - skb->len = len; - skb_set_tail_pointer(skb, len); -} - -static inline void skb_split_no_header(struct sk_buff *skb, - struct sk_buff* skb1, - const u32 len, int pos) -{ - int i, k = 0; - const int nfrags = skb_shinfo(skb)->nr_frags; - - skb_shinfo(skb)->nr_frags = 0; - skb1->len = skb1->data_len = skb->len - len; - skb->len = len; - skb->data_len = len - pos; - - for (i = 0; i < nfrags; i++) { - int size = skb_shinfo(skb)->frags[i].size; - - if (pos + size > len) { - skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; - - if (pos < len) { - /* Split frag. - * We have two variants in this case: - * 1. Move all the frag to the second - * part, if it is possible. F.e. - * this approach is mandatory for TUX, - * where splitting is expensive. - * 2. Split is accurately. We make this. - */ - get_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb1)->frags[0].page_offset += len - pos; - skb_shinfo(skb1)->frags[0].size -= len - pos; - skb_shinfo(skb)->frags[i].size = len - pos; - skb_shinfo(skb)->nr_frags++; - } - k++; - } else - skb_shinfo(skb)->nr_frags++; - pos += size; - } - skb_shinfo(skb1)->nr_frags = k; -} - -/** - * skb_split - Split fragmented skb to two parts at length len. - * @skb: the buffer to split - * @skb1: the buffer to receive the second part - * @len: new length for skb - */ -void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) -{ - int pos = skb_headlen(skb); - - if (len < pos) /* Split line is inside header. */ - skb_split_inside_header(skb, skb1, len, pos); - else /* Second chunk has no header, nothing to copy. */ - skb_split_no_header(skb, skb1, len, pos); -} - -/* Shifting from/to a cloned skb is a no-go. - * - * Caller cannot keep skb_shinfo related pointers past calling here! 
- */ -static int skb_prepare_for_shift(struct sk_buff *skb) -{ - return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); -} - -/** - * skb_shift - Shifts paged data partially from skb to another - * @tgt: buffer into which tail data gets added - * @skb: buffer from which the paged data comes from - * @shiftlen: shift up to this many bytes - * - * Attempts to shift up to shiftlen worth of bytes, which may be less than - * the length of the skb, from tgt to skb. Returns number bytes shifted. - * It's up to caller to free skb if everything was shifted. - * - * If @tgt runs out of frags, the whole operation is aborted. - * - * Skb cannot include anything else but paged data while tgt is allowed - * to have non-paged data as well. - * - * TODO: full sized shift could be optimized but that would need - * specialized skb free'er to handle frags without up-to-date nr_frags. - */ -int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) -{ - int from, to, merge, todo; - struct skb_frag_struct *fragfrom, *fragto; - - BUG_ON(shiftlen > skb->len); - BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ - - todo = shiftlen; - from = 0; - to = skb_shinfo(tgt)->nr_frags; - fragfrom = &skb_shinfo(skb)->frags[from]; - - /* Actual merge is delayed until the point when we know we can - * commit all, so that we don't have to undo partial changes - */ - if (!to || - !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) { - merge = -1; - } else { - merge = to - 1; - - todo -= fragfrom->size; - if (todo < 0) { - if (skb_prepare_for_shift(skb) || - skb_prepare_for_shift(tgt)) - return 0; - - /* All previous frag pointers might be stale! */ - fragfrom = &skb_shinfo(skb)->frags[from]; - fragto = &skb_shinfo(tgt)->frags[merge]; - - fragto->size += shiftlen; - fragfrom->size -= shiftlen; - fragfrom->page_offset += shiftlen; - - goto onlymerged; - } - - from++; - } - - /* Skip full, not-fitting skb to avoid expensive operations */ - if ((shiftlen == skb->len) && - (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) - return 0; - - if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) - return 0; - - while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { - if (to == MAX_SKB_FRAGS) - return 0; - - fragfrom = &skb_shinfo(skb)->frags[from]; - fragto = &skb_shinfo(tgt)->frags[to]; - - if (todo >= fragfrom->size) { - *fragto = *fragfrom; - todo -= fragfrom->size; - from++; - to++; - - } else { - get_page(fragfrom->page); - fragto->page = fragfrom->page; - fragto->page_offset = fragfrom->page_offset; - fragto->size = todo; - - fragfrom->page_offset += todo; - fragfrom->size -= todo; - todo = 0; - - to++; - break; - } - } - - /* Ready to "commit" this state change to tgt */ - skb_shinfo(tgt)->nr_frags = to; - - if (merge >= 0) { - fragfrom = &skb_shinfo(skb)->frags[0]; - fragto = &skb_shinfo(tgt)->frags[merge]; - - fragto->size += fragfrom->size; - put_page(fragfrom->page); - } - - /* Reposition in the original skb */ - to = 0; - while (from < skb_shinfo(skb)->nr_frags) - skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; - skb_shinfo(skb)->nr_frags = to; - - BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); - -onlymerged: - /* Most likely the tgt won't ever need its checksum anymore, skb on - * the other hand might need it if it needs to be resent - */ - tgt->ip_summed = CHECKSUM_PARTIAL; - skb->ip_summed = CHECKSUM_PARTIAL; - - /* Yak, is it really working this way? Some helper please? 
*/ - skb->len -= shiftlen; - skb->data_len -= shiftlen; - skb->truesize -= shiftlen; - tgt->len += shiftlen; - tgt->data_len += shiftlen; - tgt->truesize += shiftlen; - - return shiftlen; -} - -/** - * skb_prepare_seq_read - Prepare a sequential read of skb data - * @skb: the buffer to read - * @from: lower offset of data to be read - * @to: upper offset of data to be read - * @st: state variable - * - * Initializes the specified state variable. Must be called before - * invoking skb_seq_read() for the first time. - */ -void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, - unsigned int to, struct skb_seq_state *st) -{ - st->lower_offset = from; - st->upper_offset = to; - st->root_skb = st->cur_skb = skb; - st->frag_idx = st->stepped_offset = 0; - st->frag_data = NULL; -} - -/** - * skb_seq_read - Sequentially read skb data - * @consumed: number of bytes consumed by the caller so far - * @data: destination pointer for data to be returned - * @st: state variable - * - * Reads a block of skb data at &consumed relative to the - * lower offset specified to skb_prepare_seq_read(). Assigns - * the head of the data block to &data and returns the length - * of the block or 0 if the end of the skb data or the upper - * offset has been reached. - * - * The caller is not required to consume all of the data - * returned, i.e. &consumed is typically set to the number - * of bytes already consumed and the next call to - * skb_seq_read() will return the remaining part of the block. - * - * Note 1: The size of each block of data returned can be arbitary, - * this limitation is the cost for zerocopy seqeuental - * reads of potentially non linear data. - * - * Note 2: Fragment lists within fragments are not implemented - * at the moment, state->root_skb could be replaced with - * a stack for this purpose. - */ -unsigned int skb_seq_read(unsigned int consumed, const u8 **data, - struct skb_seq_state *st) -{ - unsigned int block_limit, abs_offset = consumed + st->lower_offset; - skb_frag_t *frag; - - if (unlikely(abs_offset >= st->upper_offset)) - return 0; - -next_skb: - block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; - - if (abs_offset < block_limit) { - *data = st->cur_skb->data + (abs_offset - st->stepped_offset); - return block_limit - abs_offset; - } - - if (st->frag_idx == 0 && !st->frag_data) - st->stepped_offset += skb_headlen(st->cur_skb); - - while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { - frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = frag->size + st->stepped_offset; - - if (abs_offset < block_limit) { - if (!st->frag_data) - st->frag_data = kmap_skb_frag(frag); - - *data = (u8 *) st->frag_data + frag->page_offset + - (abs_offset - st->stepped_offset); - - return block_limit - abs_offset; - } - - if (st->frag_data) { - kunmap_skb_frag(st->frag_data); - st->frag_data = NULL; - } - - st->frag_idx++; - st->stepped_offset += frag->size; - } - - if (st->frag_data) { - kunmap_skb_frag(st->frag_data); - st->frag_data = NULL; - } - - if (st->root_skb == st->cur_skb && - skb_shinfo(st->root_skb)->frag_list) { - st->cur_skb = skb_shinfo(st->root_skb)->frag_list; - st->frag_idx = 0; - goto next_skb; - } else if (st->cur_skb->next) { - st->cur_skb = st->cur_skb->next; - st->frag_idx = 0; - goto next_skb; - } - - return 0; -} - -/** - * skb_abort_seq_read - Abort a sequential read of skb data - * @st: state variable - * - * Must be called if skb_seq_read() was not called until it - * returned 0. 
- */ -void skb_abort_seq_read(struct skb_seq_state *st) -{ - if (st->frag_data) - kunmap_skb_frag(st->frag_data); -} - -#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) - -static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, - struct ts_config *conf, - struct ts_state *state) -{ - return skb_seq_read(offset, text, TS_SKB_CB(state)); -} - -static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) -{ - skb_abort_seq_read(TS_SKB_CB(state)); -} - -/** - * skb_find_text - Find a text pattern in skb data - * @skb: the buffer to look in - * @from: search offset - * @to: search limit - * @config: textsearch configuration - * @state: uninitialized textsearch state variable - * - * Finds a pattern in the skb data according to the specified - * textsearch configuration. Use textsearch_next() to retrieve - * subsequent occurrences of the pattern. Returns the offset - * to the first occurrence or UINT_MAX if no match was found. - */ -unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, - unsigned int to, struct ts_config *config, - struct ts_state *state) -{ - unsigned int ret; - - config->get_next_block = skb_ts_get_next_block; - config->finish = skb_ts_finish; - - skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); - - ret = textsearch_find(config, state); - return (ret <= to - from ? ret : UINT_MAX); -} - -/** - * skb_append_datato_frags: - append the user data to a skb - * @sk: sock structure - * @skb: skb structure to be appened with user data. - * @getfrag: call back function to be used for getting the user data - * @from: pointer to user message iov - * @length: length of the iov message - * - * Description: This procedure append the user data in the fragment part - * of the skb if any page alloc fails user this procedure returns -ENOMEM - */ -int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, - int (*getfrag)(void *from, char *to, int offset, - int len, int odd, struct sk_buff *skb), - void *from, int length) -{ - int frg_cnt = 0; - skb_frag_t *frag = NULL; - struct page *page = NULL; - int copy, left; - int offset = 0; - int ret; - - do { - /* Return error if we don't have space for new frag */ - frg_cnt = skb_shinfo(skb)->nr_frags; - if (frg_cnt >= MAX_SKB_FRAGS) - return -EFAULT; - - /* allocate a new page for next frag */ - page = alloc_pages(sk->sk_allocation, 0); - - /* If alloc_page fails just return failure and caller will - * free previous allocated pages by doing kfree_skb() - */ - if (page == NULL) - return -ENOMEM; - - /* initialize the next frag */ - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; - skb_fill_page_desc(skb, frg_cnt, page, 0, 0); - skb->truesize += PAGE_SIZE; - atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); - - /* get the new initialized frag */ - frg_cnt = skb_shinfo(skb)->nr_frags; - frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; - - /* copy the user data to page */ - left = PAGE_SIZE - frag->page_offset; - copy = (length > left)? 
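/*
 * Illustrative sketch, not part of the original file: a typical consumer of
 * the zero-copy sequential read API defined above. handle_block() is a
 * hypothetical caller-supplied helper.
 *
 *     struct skb_seq_state st;
 *     const u8 *data;
 *     unsigned int consumed = 0, len;
 *
 *     skb_prepare_seq_read(skb, 0, skb->len, &st);
 *     while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *             handle_block(data, len);
 *             consumed += len;
 *     }
 *     skb_abort_seq_read(&st);
 *
 * skb_abort_seq_read() is required only when the loop is left before
 * skb_seq_read() returns 0; calling it after a completed read is a no-op.
 * skb_find_text() above is the in-tree user of this pattern, feeding the
 * blocks into the textsearch infrastructure instead of handle_block().
 */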
left : length; - - ret = getfrag(from, (page_address(frag->page) + - frag->page_offset + frag->size), - offset, copy, 0, skb); - if (ret < 0) - return -EFAULT; - - /* copy was successful so update the size parameters */ - sk->sk_sndmsg_off += copy; - frag->size += copy; - skb->len += copy; - skb->data_len += copy; - offset += copy; - length -= copy; - - } while (length > 0); - - return 0; -} - -/** - * skb_pull_rcsum - pull skb and update receive checksum - * @skb: buffer to update - * @len: length of data pulled - * - * This function performs an skb_pull on the packet and updates - * the CHECKSUM_COMPLETE checksum. It should be used on - * receive path processing instead of skb_pull unless you know - * that the checksum difference is zero (e.g., a valid IP header) - * or you are setting ip_summed to CHECKSUM_NONE. - */ -unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) -{ - BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; -} - -EXPORT_SYMBOL_GPL(skb_pull_rcsum); - -/** - * skb_segment - Perform protocol segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * - * This function performs segmentation on the given skb. It returns - * a pointer to the first in a list of new skbs for the segments. - * In case of error it returns ERR_PTR(err). - */ -struct sk_buff *skb_segment(struct sk_buff *skb, int features) -{ - struct sk_buff *segs = NULL; - struct sk_buff *tail = NULL; - struct sk_buff *fskb = skb_shinfo(skb)->frag_list; - unsigned int mss = skb_shinfo(skb)->gso_size; - unsigned int doffset = skb->data - skb_mac_header(skb); - unsigned int offset = doffset; - unsigned int headroom; - unsigned int len; - int sg = features & NETIF_F_SG; - int nfrags = skb_shinfo(skb)->nr_frags; - int err = -ENOMEM; - int i = 0; - int pos; - - __skb_push(skb, doffset); - headroom = skb_headroom(skb); - pos = skb_headlen(skb); - - do { - struct sk_buff *nskb; - skb_frag_t *frag; - int hsize; - int size; - - len = skb->len - offset; - if (len > mss) - len = mss; - - hsize = skb_headlen(skb) - offset; - if (hsize < 0) - hsize = 0; - if (hsize > len || !sg) - hsize = len; - - if (!hsize && i >= nfrags) { - BUG_ON(fskb->len != len); - - pos += len; - nskb = skb_clone(fskb, GFP_ATOMIC); - fskb = fskb->next; - - if (unlikely(!nskb)) - goto err; - - hsize = skb_end_pointer(nskb) - nskb->head; - if (skb_cow_head(nskb, doffset + headroom)) { - kfree_skb(nskb); - goto err; - } - - nskb->truesize += skb_end_pointer(nskb) - nskb->head - - hsize; - skb_release_head_state(nskb); - __skb_push(nskb, doffset); - } else { - nskb = alloc_skb(hsize + doffset + headroom, - GFP_ATOMIC); - - if (unlikely(!nskb)) - goto err; - - skb_reserve(nskb, headroom); - __skb_put(nskb, doffset); - } - - if (segs) - tail->next = nskb; - else - segs = nskb; - tail = nskb; - - __copy_skb_header(nskb, skb); - nskb->mac_len = skb->mac_len; - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb->mac_len); - nskb->transport_header = (nskb->network_header + - skb_network_header_len(skb)); - skb_copy_from_linear_data(skb, nskb->data, doffset); - - if (pos >= offset + len) - continue; - - if (!sg) { - nskb->ip_summed = CHECKSUM_NONE; - nskb->csum = skb_copy_and_csum_bits(skb, offset, - skb_put(nskb, len), - len, 0); - continue; - } - - frag = skb_shinfo(nskb)->frags; - - skb_copy_from_linear_data_offset(skb, offset, - skb_put(nskb, hsize), hsize); - - while (pos < 
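/*
 * Illustrative sketch, not part of the original file: receive-path use of
 * skb_pull_rcsum() defined above. Pulling a header with plain skb_pull()
 * would leave a stale CHECKSUM_COMPLETE value; skb_pull_rcsum() folds the
 * pulled bytes out of skb->csum via skb_postpull_rcsum(). The 4-byte tag
 * header is an assumption for the example.
 *
 *     if (!pskb_may_pull(skb, 4))
 *             goto drop;
 *     skb_pull_rcsum(skb, 4);
 */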
offset + len && i < nfrags) { - *frag = skb_shinfo(skb)->frags[i]; - get_page(frag->page); - size = frag->size; - - if (pos < offset) { - frag->page_offset += offset - pos; - frag->size -= offset - pos; - } - - skb_shinfo(nskb)->nr_frags++; - - if (pos + size <= offset + len) { - i++; - pos += size; - } else { - frag->size -= pos + size - (offset + len); - goto skip_fraglist; - } - - frag++; - } - - if (pos < offset + len) { - struct sk_buff *fskb2 = fskb; - - BUG_ON(pos + fskb->len != offset + len); - - pos += fskb->len; - fskb = fskb->next; - - if (fskb2->next) { - fskb2 = skb_clone(fskb2, GFP_ATOMIC); - if (!fskb2) - goto err; - } else - skb_get(fskb2); - - BUG_ON(skb_shinfo(nskb)->frag_list); - skb_shinfo(nskb)->frag_list = fskb2; - } - -skip_fraglist: - nskb->data_len = len - hsize; - nskb->len += nskb->data_len; - nskb->truesize += nskb->data_len; - } while ((offset += len) < skb->len); - - return segs; - -err: - while ((skb = segs)) { - segs = skb->next; - kfree_skb(skb); - } - return ERR_PTR(err); -} - -EXPORT_SYMBOL_GPL(skb_segment); - -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ - struct sk_buff *p = *head; - struct sk_buff *nskb; - unsigned int headroom; - unsigned int hlen = p->data - skb_mac_header(p); - unsigned int len = skb->len; - - if (hlen + p->len + len >= 65536) - return -E2BIG; - - if (skb_shinfo(p)->frag_list) - goto merge; - else if (!skb_headlen(p) && !skb_headlen(skb) && - skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < - MAX_SKB_FRAGS) { - memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, - skb_shinfo(skb)->frags, - skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); - - skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags; - skb_shinfo(skb)->nr_frags = 0; - - skb->truesize -= skb->data_len; - skb->len -= skb->data_len; - skb->data_len = 0; - - NAPI_GRO_CB(skb)->free = 1; - goto done; - } - - headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom); - if (unlikely(!nskb)) - return -ENOMEM; - - __copy_skb_header(nskb, p); - nskb->mac_len = p->mac_len; - - skb_reserve(nskb, headroom); - - skb_set_mac_header(nskb, -hlen); - skb_set_network_header(nskb, skb_network_offset(p)); - skb_set_transport_header(nskb, skb_transport_offset(p)); - - memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); - - *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); - skb_shinfo(nskb)->frag_list = p; - skb_shinfo(nskb)->gso_size = skb_shinfo(p)->gso_size; - skb_header_release(p); - nskb->prev = p; - - nskb->data_len += p->len; - nskb->truesize += p->len; - nskb->len += p->len; - - *head = nskb; - nskb->next = p->next; - p->next = NULL; - - p = nskb; - -merge: - p->prev->next = skb; - p->prev = skb; - skb_header_release(skb); - -done: - NAPI_GRO_CB(p)->count++; - p->data_len += len; - p->truesize += len; - p->len += len; - - NAPI_GRO_CB(skb)->same_flow = 1; - return 0; -} -EXPORT_SYMBOL_GPL(skb_gro_receive); - -void __init skb_init(void) -{ - skbuff_head_cache = kmem_cache_create("skbuff_head_cache", - sizeof(struct sk_buff), - 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); - skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), - 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); -} - -/** - * skb_to_sgvec - Fill a scatter-gather list from a socket buffer - * @skb: Socket buffer containing the buffers to be mapped - * @sg: The scatter-gather list to map into - * @offset: The offset into the buffer's contents to start mapping - * @len: Length of buffer space to be mapped - * - * Fill the specified 
scatter-gather list with mappings/pointers into a - * region of the buffer space attached to a socket buffer. - */ -static int -__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - int elt = 0; - - if (copy > 0) { - if (copy > len) - copy = len; - sg_set_buf(sg, skb->data + offset, copy); - elt++; - if ((len -= copy) == 0) - return elt; - offset += copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - WARN_ON(start > offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - sg_set_page(&sg[elt], frag->page, copy, - frag->page_offset+offset-start); - elt++; - if (!(len -= copy)) - return elt; - offset += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - elt += __skb_to_sgvec(list, sg+elt, offset - start, - copy); - if ((len -= copy) == 0) - return elt; - offset += copy; - } - start = end; - } - } - BUG_ON(len); - return elt; -} - -int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) -{ - int nsg = __skb_to_sgvec(skb, sg, offset, len); - - sg_mark_end(&sg[nsg - 1]); - - return nsg; -} - -/** - * skb_cow_data - Check that a socket buffer's data buffers are writable - * @skb: The socket buffer to check. - * @tailbits: Amount of trailing space to be added - * @trailer: Returned pointer to the skb where the @tailbits space begins - * - * Make sure that the data buffers attached to a socket buffer are - * writable. If they are not, private copies are made of the data buffers - * and the socket buffer is set to use these instead. - * - * If @tailbits is given, make sure that there is space to write @tailbits - * bytes of data beyond current end of socket buffer. @trailer will be - * set to point to the skb in which this space begins. - * - * The number of scatterlist elements required to completely map the - * COW'd and extended socket buffer will be returned. - */ -int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) -{ - int copyflag; - int elt; - struct sk_buff *skb1, **skb_p; - - /* If skb is cloned or its head is paged, reallocate - * head pulling out all the pages (pages are considered not writable - * at the moment even if they are anonymous). - */ - if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && - __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) - return -ENOMEM; - - /* Easy case. Most of packets will go this way. */ - if (!skb_shinfo(skb)->frag_list) { - /* A little of trouble, not enough of space for trailer. - * This should not happen, when stack is tuned to generate - * good frames. OK, on miss we reallocate and reserve even more - * space, 128 bytes is fair. */ - - if (skb_tailroom(skb) < tailbits && - pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) - return -ENOMEM; - - /* Voila! */ - *trailer = skb; - return 1; - } - - /* Misery. We are in troubles, going to mincer fragments... */ - - elt = 1; - skb_p = &skb_shinfo(skb)->frag_list; - copyflag = 0; - - while ((skb1 = *skb_p) != NULL) { - int ntail = 0; - - /* The fragment is partially pulled by someone, - * this can happen on input. 
Copy it and everything - * after it. */ - - if (skb_shared(skb1)) - copyflag = 1; - - /* If the skb is the last, worry about trailer. */ - - if (skb1->next == NULL && tailbits) { - if (skb_shinfo(skb1)->nr_frags || - skb_shinfo(skb1)->frag_list || - skb_tailroom(skb1) < tailbits) - ntail = tailbits + 128; - } - - if (copyflag || - skb_cloned(skb1) || - ntail || - skb_shinfo(skb1)->nr_frags || - skb_shinfo(skb1)->frag_list) { - struct sk_buff *skb2; - - /* Fuck, we are miserable poor guys... */ - if (ntail == 0) - skb2 = skb_copy(skb1, GFP_ATOMIC); - else - skb2 = skb_copy_expand(skb1, - skb_headroom(skb1), - ntail, - GFP_ATOMIC); - if (unlikely(skb2 == NULL)) - return -ENOMEM; - - if (skb1->sk) - skb_set_owner_w(skb2, skb1->sk); - - /* Looking around. Are we still alive? - * OK, link new skb, drop old one */ - - skb2->next = skb1->next; - *skb_p = skb2; - kfree_skb(skb1); - skb1 = skb2; - } - elt++; - *trailer = skb1; - skb_p = &skb1->next; - } - - return elt; -} - -/** - * skb_partial_csum_set - set up and verify partial csum values for packet - * @skb: the skb to set - * @start: the number of bytes after skb->data to start checksumming. - * @off: the offset from start to place the checksum. - * - * For untrusted partially-checksummed packets, we need to make sure the values - * for skb->csum_start and skb->csum_offset are valid so we don't oops. - * - * This function checks and sets those values and skb->ip_summed: if this - * returns false you should drop the packet. - */ -bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) -{ - if (unlikely(start > skb->len - 2) || - unlikely((int)start + off > skb->len - 2)) { - if (net_ratelimit()) - printk(KERN_WARNING - "bad partial csum: csum=%u/%u len=%u\n", - start, off, skb->len); - return false; - } - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + start; - skb->csum_offset = off; - return true; -} - -void __skb_warn_lro_forwarding(const struct sk_buff *skb) -{ - if (net_ratelimit()) - pr_warning("%s: received packets cannot be forwarded" - " while LRO is enabled\n", skb->dev->name); -} - -EXPORT_SYMBOL(___pskb_trim); -EXPORT_SYMBOL(__kfree_skb); -EXPORT_SYMBOL(kfree_skb); -EXPORT_SYMBOL(__pskb_pull_tail); -EXPORT_SYMBOL(__alloc_skb); -EXPORT_SYMBOL(__netdev_alloc_skb); -EXPORT_SYMBOL(pskb_copy); -EXPORT_SYMBOL(pskb_expand_head); -EXPORT_SYMBOL(skb_checksum); -EXPORT_SYMBOL(skb_clone); -EXPORT_SYMBOL(skb_copy); -EXPORT_SYMBOL(skb_copy_and_csum_bits); -EXPORT_SYMBOL(skb_copy_and_csum_dev); -EXPORT_SYMBOL(skb_copy_bits); -EXPORT_SYMBOL(skb_copy_expand); -EXPORT_SYMBOL(skb_over_panic); -EXPORT_SYMBOL(skb_pad); -EXPORT_SYMBOL(skb_realloc_headroom); -EXPORT_SYMBOL(skb_under_panic); -EXPORT_SYMBOL(skb_dequeue); -EXPORT_SYMBOL(skb_dequeue_tail); -EXPORT_SYMBOL(skb_insert); -EXPORT_SYMBOL(skb_queue_purge); -EXPORT_SYMBOL(skb_queue_head); -EXPORT_SYMBOL(skb_queue_tail); -EXPORT_SYMBOL(skb_unlink); -EXPORT_SYMBOL(skb_append); -EXPORT_SYMBOL(skb_split); -EXPORT_SYMBOL(skb_prepare_seq_read); -EXPORT_SYMBOL(skb_seq_read); -EXPORT_SYMBOL(skb_abort_seq_read); -EXPORT_SYMBOL(skb_find_text); -EXPORT_SYMBOL(skb_append_datato_frags); -EXPORT_SYMBOL(__skb_warn_lro_forwarding); - -EXPORT_SYMBOL_GPL(skb_to_sgvec); -EXPORT_SYMBOL_GPL(skb_cow_data); -EXPORT_SYMBOL_GPL(skb_partial_csum_set); diff --git a/libdde_linux26/lib/src/net/core/.svn/text-base/utils.c.svn-base b/libdde_linux26/lib/src/net/core/.svn/text-base/utils.c.svn-base deleted file mode 100644 index 5d10a675..00000000 --- 
a/libdde_linux26/lib/src/net/core/.svn/text-base/utils.c.svn-base +++ /dev/null @@ -1,309 +0,0 @@ -/* - * Generic address resultion entity - * - * Authors: - * net_random Alan Cox - * net_ratelimit Andi Kleen - * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project - * - * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/inet.h> -#include <linux/mm.h> -#include <linux/net.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/random.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <net/sock.h> - -#include <asm/byteorder.h> -#include <asm/system.h> -#include <asm/uaccess.h> - -#ifndef DDE_LINUX -int net_msg_cost __read_mostly = 5*HZ; -DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); -#else -int net_msg_cost = 500; -#endif /* DDE_LINUX */ -int net_msg_burst __read_mostly = 10; -int net_msg_warn __read_mostly = 1; -EXPORT_SYMBOL(net_msg_warn); - -/* - * All net warning printk()s should be guarded by this function. - */ -int net_ratelimit(void) -{ -#ifndef DDE_LINUX - return __ratelimit(&net_ratelimit_state); -#else - return 0; -#endif -} -EXPORT_SYMBOL(net_ratelimit); - -/* - * Convert an ASCII string to binary IP. - * This is outside of net/ipv4/ because various code that uses IP addresses - * is otherwise not dependent on the TCP/IP stack. - */ - -__be32 in_aton(const char *str) -{ - unsigned long l; - unsigned int val; - int i; - - l = 0; - for (i = 0; i < 4; i++) - { - l <<= 8; - if (*str != '\0') - { - val = 0; - while (*str != '\0' && *str != '.' && *str != '\n') - { - val *= 10; - val += *str - '0'; - str++; - } - l |= val; - if (*str != '\0') - str++; - } - } - return(htonl(l)); -} - -EXPORT_SYMBOL(in_aton); - -#define IN6PTON_XDIGIT 0x00010000 -#define IN6PTON_DIGIT 0x00020000 -#define IN6PTON_COLON_MASK 0x00700000 -#define IN6PTON_COLON_1 0x00100000 /* single : requested */ -#define IN6PTON_COLON_2 0x00200000 /* second : requested */ -#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */ -#define IN6PTON_DOT 0x00800000 /* . */ -#define IN6PTON_DELIM 0x10000000 -#define IN6PTON_NULL 0x20000000 /* first/tail */ -#define IN6PTON_UNKNOWN 0x40000000 - -static inline int xdigit2bin(char c, int delim) -{ - if (c == delim || c == '\0') - return IN6PTON_DELIM; - if (c == ':') - return IN6PTON_COLON_MASK; - if (c == '.') - return IN6PTON_DOT; - if (c >= '0' && c <= '9') - return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0')); - if (c >= 'a' && c <= 'f') - return (IN6PTON_XDIGIT | (c - 'a' + 10)); - if (c >= 'A' && c <= 'F') - return (IN6PTON_XDIGIT | (c - 'A' + 10)); - if (delim == -1) - return IN6PTON_DELIM; - return IN6PTON_UNKNOWN; -} - -int in4_pton(const char *src, int srclen, - u8 *dst, - int delim, const char **end) -{ - const char *s; - u8 *d; - u8 dbuf[4]; - int ret = 0; - int i; - int w = 0; - - if (srclen < 0) - srclen = strlen(src); - s = src; - d = dbuf; - i = 0; - while(1) { - int c; - c = xdigit2bin(srclen > 0 ? 
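/*
 * Illustrative sketch, not part of the original file: in_aton() above turns
 * a dotted-quad string into a network-byte-order address, e.g. for module
 * parameters given as text.
 *
 *     __be32 addr = in_aton("192.168.1.1");
 *
 * yields addr == htonl(0xc0a80101). in_aton() performs essentially no
 * validation; in4_pton() below is the stricter parser and reports success
 * or failure.
 */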
*s : '\0', delim); - if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { - goto out; - } - if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) { - if (w == 0) - goto out; - *d++ = w & 0xff; - w = 0; - i++; - if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { - if (i != 4) - goto out; - break; - } - goto cont; - } - w = (w * 10) + c; - if ((w & 0xffff) > 255) { - goto out; - } -cont: - if (i >= 4) - goto out; - s++; - srclen--; - } - ret = 1; - memcpy(dst, dbuf, sizeof(dbuf)); -out: - if (end) - *end = s; - return ret; -} - -EXPORT_SYMBOL(in4_pton); - -int in6_pton(const char *src, int srclen, - u8 *dst, - int delim, const char **end) -{ - const char *s, *tok = NULL; - u8 *d, *dc = NULL; - u8 dbuf[16]; - int ret = 0; - int i; - int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL; - int w = 0; - - memset(dbuf, 0, sizeof(dbuf)); - - s = src; - d = dbuf; - if (srclen < 0) - srclen = strlen(src); - - while (1) { - int c; - - c = xdigit2bin(srclen > 0 ? *s : '\0', delim); - if (!(c & state)) - goto out; - if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { - /* process one 16-bit word */ - if (!(state & IN6PTON_NULL)) { - *d++ = (w >> 8) & 0xff; - *d++ = w & 0xff; - } - w = 0; - if (c & IN6PTON_DELIM) { - /* We've processed last word */ - break; - } - /* - * COLON_1 => XDIGIT - * COLON_2 => XDIGIT|DELIM - * COLON_1_2 => COLON_2 - */ - switch (state & IN6PTON_COLON_MASK) { - case IN6PTON_COLON_2: - dc = d; - state = IN6PTON_XDIGIT | IN6PTON_DELIM; - if (dc - dbuf >= sizeof(dbuf)) - state |= IN6PTON_NULL; - break; - case IN6PTON_COLON_1|IN6PTON_COLON_1_2: - state = IN6PTON_XDIGIT | IN6PTON_COLON_2; - break; - case IN6PTON_COLON_1: - state = IN6PTON_XDIGIT; - break; - case IN6PTON_COLON_1_2: - state = IN6PTON_COLON_2; - break; - default: - state = 0; - } - tok = s + 1; - goto cont; - } - - if (c & IN6PTON_DOT) { - ret = in4_pton(tok ? 
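/*
 * Illustrative sketch, not part of the original file: in4_pton(), defined
 * above and used by in6_pton() for embedded IPv4 addresses, returns 1 on
 * success and 0 on a malformed address.
 *
 *     u8 ip[4];
 *     const char *end;
 *
 *     if (!in4_pton("10.0.0.2", -1, ip, -1, &end))
 *             return -EINVAL;
 *
 * On success ip[] holds { 10, 0, 0, 2 } and end points at the terminating
 * character. A negative srclen means "use strlen(src)", and a delim of -1
 * lets any unrecognized character act as the terminator.
 */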
tok : s, srclen + (int)(s - tok), d, delim, &s); - if (ret > 0) { - d += 4; - break; - } - goto out; - } - - w = (w << 4) | (0xff & c); - state = IN6PTON_COLON_1 | IN6PTON_DELIM; - if (!(w & 0xf000)) { - state |= IN6PTON_XDIGIT; - } - if (!dc && d + 2 < dbuf + sizeof(dbuf)) { - state |= IN6PTON_COLON_1_2; - state &= ~IN6PTON_DELIM; - } - if (d + 2 >= dbuf + sizeof(dbuf)) { - state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2); - } -cont: - if ((dc && d + 4 < dbuf + sizeof(dbuf)) || - d + 4 == dbuf + sizeof(dbuf)) { - state |= IN6PTON_DOT; - } - if (d >= dbuf + sizeof(dbuf)) { - state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK); - } - s++; - srclen--; - } - - i = 15; d--; - - if (dc) { - while(d >= dc) - dst[i--] = *d--; - while(i >= dc - dbuf) - dst[i--] = 0; - while(i >= 0) - dst[i--] = *d--; - } else - memcpy(dst, dbuf, sizeof(dbuf)); - - ret = 1; -out: - if (end) - *end = s; - return ret; -} - -EXPORT_SYMBOL(in6_pton); - -void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr) -{ - __be32 diff[] = { ~from, to }; - if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_partial(diff, sizeof(diff), - ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); - } else if (pseudohdr) - *sum = ~csum_fold(csum_partial(diff, sizeof(diff), - csum_unfold(*sum))); -} -EXPORT_SYMBOL(inet_proto_csum_replace4); diff --git a/libdde_linux26/lib/src/net/netlink/.svn/all-wcprops b/libdde_linux26/lib/src/net/netlink/.svn/all-wcprops deleted file mode 100644 index 0a8a5102..00000000 --- a/libdde_linux26/lib/src/net/netlink/.svn/all-wcprops +++ /dev/null @@ -1,11 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 70 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/netlink -END -af_netlink.c -K 25 -svn:wc:ra_dav:version-url -V 83 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/netlink/af_netlink.c -END diff --git a/libdde_linux26/lib/src/net/netlink/.svn/entries b/libdde_linux26/lib/src/net/netlink/.svn/entries deleted file mode 100644 index 402cf7d8..00000000 --- a/libdde_linux26/lib/src/net/netlink/.svn/entries +++ /dev/null @@ -1,62 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/net/netlink -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -af_netlink.c -file - - - - -2009-11-15T17:17:07.000000Z -d6f60c20045cf045ccf3ecffb1d15d18 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -45165 - diff --git a/libdde_linux26/lib/src/net/netlink/.svn/format b/libdde_linux26/lib/src/net/netlink/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/net/netlink/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/net/netlink/.svn/text-base/af_netlink.c.svn-base b/libdde_linux26/lib/src/net/netlink/.svn/text-base/af_netlink.c.svn-base deleted file mode 100644 index 3f00a014..00000000 --- a/libdde_linux26/lib/src/net/netlink/.svn/text-base/af_netlink.c.svn-base +++ /dev/null @@ -1,2013 +0,0 @@ -/* - * NETLINK Kernel-user communication protocol. 
- * - * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> - * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith - * added netlink_proto_exit - * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> - * use nlk_sk, as sk->protinfo is on a diet 8) - * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> - * - inc module use count of module that owns - * the kernel socket in case userspace opens - * socket of same protocol - * - remove all module support, since netlink is - * mandatory if CONFIG_NET=y these days - */ - -#include <linux/module.h> - -#include <linux/capability.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/socket.h> -#include <linux/un.h> -#include <linux/fcntl.h> -#include <linux/termios.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/notifier.h> -#include <linux/security.h> -#include <linux/jhash.h> -#include <linux/jiffies.h> -#include <linux/random.h> -#include <linux/bitops.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <linux/audit.h> -#include <linux/mutex.h> - -#include <net/net_namespace.h> -#include <net/sock.h> -#include <net/scm.h> -#include <net/netlink.h> - -#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) -#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) - -struct netlink_sock { - /* struct sock has to be the first member of netlink_sock */ - struct sock sk; - u32 pid; - u32 dst_pid; - u32 dst_group; - u32 flags; - u32 subscriptions; - u32 ngroups; - unsigned long *groups; - unsigned long state; - wait_queue_head_t wait; - struct netlink_callback *cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; - void (*netlink_rcv)(struct sk_buff *skb); - struct module *module; -}; - -#define NETLINK_KERNEL_SOCKET 0x1 -#define NETLINK_RECV_PKTINFO 0x2 - -static inline struct netlink_sock *nlk_sk(struct sock *sk) -{ - return container_of(sk, struct netlink_sock, sk); -} - -static inline int netlink_is_kernel(struct sock *sk) -{ - return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; -} - -struct nl_pid_hash { - struct hlist_head *table; - unsigned long rehash_time; - - unsigned int mask; - unsigned int shift; - - unsigned int entries; - unsigned int max_shift; - - u32 rnd; -}; - -struct netlink_table { - struct nl_pid_hash hash; - struct hlist_head mc_list; - unsigned long *listeners; - unsigned int nl_nonroot; - unsigned int groups; - struct mutex *cb_mutex; - struct module *module; - int registered; -}; - -static struct netlink_table *nl_table; - -static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); - -static int netlink_dump(struct sock *sk); -static void netlink_destroy_callback(struct netlink_callback *cb); - -static DEFINE_RWLOCK(nl_table_lock); -static atomic_t nl_table_users = ATOMIC_INIT(0); - -static ATOMIC_NOTIFIER_HEAD(netlink_chain); - -static u32 netlink_group_mask(u32 group) -{ - return group ? 
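/*
 * Illustrative note, not part of the original file: multicast group numbers
 * are 1-based, while the legacy 32-bit nl_groups field in sockaddr_nl is a
 * bitmask with bit 0 standing for group 1. netlink_group_mask() converts a
 * group number into that bitmask form, for example:
 *
 *     netlink_group_mask(1) == 0x00000001
 *     netlink_group_mask(5) == 0x00000010
 *     netlink_group_mask(0) == 0
 */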
1 << (group - 1) : 0; -} - -static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) -{ - return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; -} - -static void netlink_sock_destruct(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (nlk->cb) { - if (nlk->cb->done) - nlk->cb->done(nlk->cb); - netlink_destroy_callback(nlk->cb); - } - - skb_queue_purge(&sk->sk_receive_queue); - - if (!sock_flag(sk, SOCK_DEAD)) { - printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); - return; - } - - WARN_ON(atomic_read(&sk->sk_rmem_alloc)); - WARN_ON(atomic_read(&sk->sk_wmem_alloc)); - WARN_ON(nlk_sk(sk)->groups); -} - -/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on - * SMP. Look, when several writers sleep and reader wakes them up, all but one - * immediately hit write lock and grab all the cpus. Exclusive sleep solves - * this, _but_ remember, it adds useless work on UP machines. - */ - -static void netlink_table_grab(void) - __acquires(nl_table_lock) -{ - write_lock_irq(&nl_table_lock); - - if (atomic_read(&nl_table_users)) { - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&nl_table_wait, &wait); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&nl_table_users) == 0) - break; - write_unlock_irq(&nl_table_lock); - schedule(); - write_lock_irq(&nl_table_lock); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nl_table_wait, &wait); - } -} - -static void netlink_table_ungrab(void) - __releases(nl_table_lock) -{ - write_unlock_irq(&nl_table_lock); - wake_up(&nl_table_wait); -} - -static inline void -netlink_lock_table(void) -{ - /* read_lock() synchronizes us to netlink_table_grab */ - - read_lock(&nl_table_lock); - atomic_inc(&nl_table_users); - read_unlock(&nl_table_lock); -} - -static inline void -netlink_unlock_table(void) -{ - if (atomic_dec_and_test(&nl_table_users)) - wake_up(&nl_table_wait); -} - -static inline struct sock *netlink_lookup(struct net *net, int protocol, - u32 pid) -{ - struct nl_pid_hash *hash = &nl_table[protocol].hash; - struct hlist_head *head; - struct sock *sk; - struct hlist_node *node; - - read_lock(&nl_table_lock); - head = nl_pid_hashfn(hash, pid); - sk_for_each(sk, node, head) { - if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) { - sock_hold(sk); - goto found; - } - } - sk = NULL; -found: - read_unlock(&nl_table_lock); - return sk; -} - -static inline struct hlist_head *nl_pid_hash_zalloc(size_t size) -{ - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_ATOMIC); - else - return (struct hlist_head *) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); -} - -static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) -{ - if (size <= PAGE_SIZE) - kfree(table); - else - free_pages((unsigned long)table, get_order(size)); -} - -static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) -{ - unsigned int omask, mask, shift; - size_t osize, size; - struct hlist_head *otable, *table; - int i; - - omask = mask = hash->mask; - osize = size = (mask + 1) * sizeof(*table); - shift = hash->shift; - - if (grow) { - if (++shift > hash->max_shift) - return 0; - mask = mask * 2 + 1; - size *= 2; - } - - table = nl_pid_hash_zalloc(size); - if (!table) - return 0; - - otable = hash->table; - hash->table = table; - hash->mask = mask; - hash->shift = shift; - get_random_bytes(&hash->rnd, sizeof(hash->rnd)); - - for (i = 0; i <= omask; i++) { - struct sock *sk; - struct hlist_node *node, *tmp; - - 
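/*
 * Illustrative note, not part of the original file: the helpers above form a
 * two-level locking scheme for nl_table. Short read-side sections take
 * nl_table_lock directly, as netlink_lookup() does; sections that may sleep
 * instead bump nl_table_users via netlink_lock_table()/netlink_unlock_table();
 * writers call netlink_table_grab(), which takes the write lock and waits for
 * those counted readers to drain. A sketch of a writer updating table state
 * (new_groups is an assumption for the example):
 *
 *     netlink_table_grab();
 *     nl_table[protocol].groups = new_groups;
 *     netlink_table_ungrab();
 */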
sk_for_each_safe(sk, node, tmp, &otable[i]) - __sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid)); - } - - nl_pid_hash_free(otable, osize); - hash->rehash_time = jiffies + 10 * 60 * HZ; - return 1; -} - -static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) -{ - int avg = hash->entries >> hash->shift; - - if (unlikely(avg > 1) && nl_pid_hash_rehash(hash, 1)) - return 1; - - if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { - nl_pid_hash_rehash(hash, 0); - return 1; - } - - return 0; -} - -static const struct proto_ops netlink_ops; - -static void -netlink_update_listeners(struct sock *sk) -{ - struct netlink_table *tbl = &nl_table[sk->sk_protocol]; - struct hlist_node *node; - unsigned long mask; - unsigned int i; - - for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { - mask = 0; - sk_for_each_bound(sk, node, &tbl->mc_list) { - if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) - mask |= nlk_sk(sk)->groups[i]; - } - tbl->listeners[i] = mask; - } - /* this function is only called with the netlink table "grabbed", which - * makes sure updates are visible before bind or setsockopt return. */ -} - -static int netlink_insert(struct sock *sk, struct net *net, u32 pid) -{ - struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; - struct hlist_head *head; - int err = -EADDRINUSE; - struct sock *osk; - struct hlist_node *node; - int len; - - netlink_table_grab(); - head = nl_pid_hashfn(hash, pid); - len = 0; - sk_for_each(osk, node, head) { - if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->pid == pid)) - break; - len++; - } - if (node) - goto err; - - err = -EBUSY; - if (nlk_sk(sk)->pid) - goto err; - - err = -ENOMEM; - if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) - goto err; - - if (len && nl_pid_hash_dilute(hash, len)) - head = nl_pid_hashfn(hash, pid); - hash->entries++; - nlk_sk(sk)->pid = pid; - sk_add_node(sk, head); - err = 0; - -err: - netlink_table_ungrab(); - return err; -} - -static void netlink_remove(struct sock *sk) -{ - netlink_table_grab(); - if (sk_del_node_init(sk)) - nl_table[sk->sk_protocol].hash.entries--; - if (nlk_sk(sk)->subscriptions) - __sk_del_bind_node(sk); - netlink_table_ungrab(); -} - -static struct proto netlink_proto = { - .name = "NETLINK", - .owner = THIS_MODULE, - .obj_size = sizeof(struct netlink_sock), -}; - -static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol) -{ - struct sock *sk; - struct netlink_sock *nlk; - - sock->ops = &netlink_ops; - - sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); - if (!sk) - return -ENOMEM; - - sock_init_data(sock, sk); - - nlk = nlk_sk(sk); - if (cb_mutex) - nlk->cb_mutex = cb_mutex; - else { - nlk->cb_mutex = &nlk->cb_def_mutex; - mutex_init(nlk->cb_mutex); - } - init_waitqueue_head(&nlk->wait); - - sk->sk_destruct = netlink_sock_destruct; - sk->sk_protocol = protocol; - return 0; -} - -static int netlink_create(struct net *net, struct socket *sock, int protocol) -{ - struct module *module = NULL; - struct mutex *cb_mutex; - struct netlink_sock *nlk; - int err = 0; - - sock->state = SS_UNCONNECTED; - - if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) - return -ESOCKTNOSUPPORT; - - if (protocol < 0 || protocol >= MAX_LINKS) - return -EPROTONOSUPPORT; - - netlink_lock_table(); -#ifdef CONFIG_MODULES - if (!nl_table[protocol].registered) { - netlink_unlock_table(); - request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); - netlink_lock_table(); - } -#endif - if (nl_table[protocol].registered && - 
try_module_get(nl_table[protocol].module)) - module = nl_table[protocol].module; - cb_mutex = nl_table[protocol].cb_mutex; - netlink_unlock_table(); - - err = __netlink_create(net, sock, cb_mutex, protocol); - if (err < 0) - goto out_module; - - local_bh_disable(); - sock_prot_inuse_add(net, &netlink_proto, 1); - local_bh_enable(); - - nlk = nlk_sk(sock->sk); - nlk->module = module; -out: - return err; - -out_module: - module_put(module); - goto out; -} - -static int netlink_release(struct socket *sock) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk; - - if (!sk) - return 0; - - netlink_remove(sk); - sock_orphan(sk); - nlk = nlk_sk(sk); - - /* - * OK. Socket is unlinked, any packets that arrive now - * will be purged. - */ - - sock->sk = NULL; - wake_up_interruptible_all(&nlk->wait); - - skb_queue_purge(&sk->sk_write_queue); - - if (nlk->pid && !nlk->subscriptions) { - struct netlink_notify n = { - .net = sock_net(sk), - .protocol = sk->sk_protocol, - .pid = nlk->pid, - }; - atomic_notifier_call_chain(&netlink_chain, - NETLINK_URELEASE, &n); - } - - module_put(nlk->module); - - netlink_table_grab(); - if (netlink_is_kernel(sk)) { - BUG_ON(nl_table[sk->sk_protocol].registered == 0); - if (--nl_table[sk->sk_protocol].registered == 0) { - kfree(nl_table[sk->sk_protocol].listeners); - nl_table[sk->sk_protocol].module = NULL; - nl_table[sk->sk_protocol].registered = 0; - } - } else if (nlk->subscriptions) - netlink_update_listeners(sk); - netlink_table_ungrab(); - - kfree(nlk->groups); - nlk->groups = NULL; - - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); - local_bh_enable(); - sock_put(sk); - return 0; -} - -static int netlink_autobind(struct socket *sock) -{ - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; - struct hlist_head *head; - struct sock *osk; - struct hlist_node *node; - s32 pid = current->tgid; - int err; - static s32 rover = -4097; - -retry: - cond_resched(); - netlink_table_grab(); - head = nl_pid_hashfn(hash, pid); - sk_for_each(osk, node, head) { - if (!net_eq(sock_net(osk), net)) - continue; - if (nlk_sk(osk)->pid == pid) { - /* Bind collision, search negative pid values. */ - pid = rover--; - if (rover > -4097) - rover = -4097; - netlink_table_ungrab(); - goto retry; - } - } - netlink_table_ungrab(); - - err = netlink_insert(sk, net, pid); - if (err == -EADDRINUSE) - goto retry; - - /* If 2 threads race to autobind, that is fine. 
*/ - if (err == -EBUSY) - err = 0; - - return err; -} - -static inline int netlink_capable(struct socket *sock, unsigned int flag) -{ - return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || - capable(CAP_NET_ADMIN); -} - -static void -netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (nlk->subscriptions && !subscriptions) - __sk_del_bind_node(sk); - else if (!nlk->subscriptions && subscriptions) - sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); - nlk->subscriptions = subscriptions; -} - -static int netlink_realloc_groups(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - unsigned int groups; - unsigned long *new_groups; - int err = 0; - - netlink_table_grab(); - - groups = nl_table[sk->sk_protocol].groups; - if (!nl_table[sk->sk_protocol].registered) { - err = -ENOENT; - goto out_unlock; - } - - if (nlk->ngroups >= groups) - goto out_unlock; - - new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); - if (new_groups == NULL) { - err = -ENOMEM; - goto out_unlock; - } - memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, - NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); - - nlk->groups = new_groups; - nlk->ngroups = groups; - out_unlock: - netlink_table_ungrab(); - return err; -} - -static int netlink_bind(struct socket *sock, struct sockaddr *addr, - int addr_len) -{ - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; - int err; - - if (nladdr->nl_family != AF_NETLINK) - return -EINVAL; - - /* Only superuser is allowed to listen multicasts */ - if (nladdr->nl_groups) { - if (!netlink_capable(sock, NL_NONROOT_RECV)) - return -EPERM; - err = netlink_realloc_groups(sk); - if (err) - return err; - } - - if (nlk->pid) { - if (nladdr->nl_pid != nlk->pid) - return -EINVAL; - } else { - err = nladdr->nl_pid ? 
- netlink_insert(sk, net, nladdr->nl_pid) : - netlink_autobind(sock); - if (err) - return err; - } - - if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) - return 0; - - netlink_table_grab(); - netlink_update_subscriptions(sk, nlk->subscriptions + - hweight32(nladdr->nl_groups) - - hweight32(nlk->groups[0])); - nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; - netlink_update_listeners(sk); - netlink_table_ungrab(); - - return 0; -} - -static int netlink_connect(struct socket *sock, struct sockaddr *addr, - int alen, int flags) -{ - int err = 0; - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; - - if (addr->sa_family == AF_UNSPEC) { - sk->sk_state = NETLINK_UNCONNECTED; - nlk->dst_pid = 0; - nlk->dst_group = 0; - return 0; - } - if (addr->sa_family != AF_NETLINK) - return -EINVAL; - - /* Only superuser is allowed to send multicasts */ - if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) - return -EPERM; - - if (!nlk->pid) - err = netlink_autobind(sock); - - if (err == 0) { - sk->sk_state = NETLINK_CONNECTED; - nlk->dst_pid = nladdr->nl_pid; - nlk->dst_group = ffs(nladdr->nl_groups); - } - - return err; -} - -static int netlink_getname(struct socket *sock, struct sockaddr *addr, - int *addr_len, int peer) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; - - nladdr->nl_family = AF_NETLINK; - nladdr->nl_pad = 0; - *addr_len = sizeof(*nladdr); - - if (peer) { - nladdr->nl_pid = nlk->dst_pid; - nladdr->nl_groups = netlink_group_mask(nlk->dst_group); - } else { - nladdr->nl_pid = nlk->pid; - nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; - } - return 0; -} - -static void netlink_overrun(struct sock *sk) -{ - if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - } -} - -static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) -{ - struct sock *sock; - struct netlink_sock *nlk; - - sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, pid); - if (!sock) - return ERR_PTR(-ECONNREFUSED); - - /* Don't bother queuing skb if kernel socket has no input function */ - nlk = nlk_sk(sock); - if (sock->sk_state == NETLINK_CONNECTED && - nlk->dst_pid != nlk_sk(ssk)->pid) { - sock_put(sock); - return ERR_PTR(-ECONNREFUSED); - } - return sock; -} - -struct sock *netlink_getsockbyfilp(struct file *filp) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct sock *sock; - - if (!S_ISSOCK(inode->i_mode)) - return ERR_PTR(-ENOTSOCK); - - sock = SOCKET_I(inode)->sk; - if (sock->sk_family != AF_NETLINK) - return ERR_PTR(-EINVAL); - - sock_hold(sock); - return sock; -} - -/* - * Attach a skb to a netlink socket. - * The caller must hold a reference to the destination socket. On error, the - * reference is dropped. The skb is not send to the destination, just all - * all error checks are performed and memory in the queue is reserved. - * Return values: - * < 0: error. skb freed, reference to sock dropped. - * 0: continue - * 1: repeat lookup - reference dropped while waiting for socket memory. 
- */ -int netlink_attachskb(struct sock *sk, struct sk_buff *skb, - long *timeo, struct sock *ssk) -{ - struct netlink_sock *nlk; - - nlk = nlk_sk(sk); - - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) { - DECLARE_WAITQUEUE(wait, current); - if (!*timeo) { - if (!ssk || netlink_is_kernel(ssk)) - netlink_overrun(sk); - sock_put(sk); - kfree_skb(skb); - return -EAGAIN; - } - - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&nlk->wait, &wait); - - if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(0, &nlk->state)) && - !sock_flag(sk, SOCK_DEAD)) - *timeo = schedule_timeout(*timeo); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nlk->wait, &wait); - sock_put(sk); - - if (signal_pending(current)) { - kfree_skb(skb); - return sock_intr_errno(*timeo); - } - return 1; - } - skb_set_owner_r(skb, sk); - return 0; -} - -int netlink_sendskb(struct sock *sk, struct sk_buff *skb) -{ - int len = skb->len; - - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, len); - sock_put(sk); - return len; -} - -void netlink_detachskb(struct sock *sk, struct sk_buff *skb) -{ - kfree_skb(skb); - sock_put(sk); -} - -static inline struct sk_buff *netlink_trim(struct sk_buff *skb, - gfp_t allocation) -{ - int delta; - - skb_orphan(skb); - - delta = skb->end - skb->tail; - if (delta * 2 < skb->truesize) - return skb; - - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, allocation); - if (!nskb) - return skb; - kfree_skb(skb); - skb = nskb; - } - - if (!pskb_expand_head(skb, 0, -delta, allocation)) - skb->truesize -= delta; - - return skb; -} - -static inline void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(0, &nlk->state); - if (!test_bit(0, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - -static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb) -{ - int ret; - struct netlink_sock *nlk = nlk_sk(sk); - - ret = -ECONNREFUSED; - if (nlk->netlink_rcv != NULL) { - ret = skb->len; - skb_set_owner_r(skb, sk); - nlk->netlink_rcv(skb); - } - kfree_skb(skb); - sock_put(sk); - return ret; -} - -int netlink_unicast(struct sock *ssk, struct sk_buff *skb, - u32 pid, int nonblock) -{ - struct sock *sk; - int err; - long timeo; - - skb = netlink_trim(skb, gfp_any()); - - timeo = sock_sndtimeo(ssk, nonblock); -retry: - sk = netlink_getsockbypid(ssk, pid); - if (IS_ERR(sk)) { - kfree_skb(skb); - return PTR_ERR(sk); - } - if (netlink_is_kernel(sk)) - return netlink_unicast_kernel(sk, skb); - - if (sk_filter(sk, skb)) { - err = skb->len; - kfree_skb(skb); - sock_put(sk); - return err; - } - - err = netlink_attachskb(sk, skb, &timeo, ssk); - if (err == 1) - goto retry; - if (err) - return err; - - return netlink_sendskb(sk, skb); -} -EXPORT_SYMBOL(netlink_unicast); - -int netlink_has_listeners(struct sock *sk, unsigned int group) -{ - int res = 0; - unsigned long *listeners; - - BUG_ON(!netlink_is_kernel(sk)); - - rcu_read_lock(); - listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); - - if (group - 1 < nl_table[sk->sk_protocol].groups) - res = test_bit(group - 1, listeners); - - rcu_read_unlock(); - - return res; -} -EXPORT_SYMBOL_GPL(netlink_has_listeners); - -static inline int netlink_broadcast_deliver(struct sock *sk, - struct sk_buff *skb) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(0, &nlk->state)) { - skb_set_owner_r(skb, 
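/*
 * Illustrative sketch, not part of the original file: a kernel-side sender
 * built on netlink_unicast() above. my_nl_sk, dst_pid, MY_MSG_TYPE and the
 * payload buffer are assumptions, not names from this file; my_nl_sk would
 * come from netlink_kernel_create(), defined further down.
 *
 *     struct sk_buff *skb = nlmsg_new(payload_len, GFP_KERNEL);
 *     struct nlmsghdr *nlh;
 *
 *     if (!skb)
 *             return -ENOMEM;
 *     nlh = nlmsg_put(skb, 0, 0, MY_MSG_TYPE, payload_len, 0);
 *     memcpy(nlmsg_data(nlh), payload, payload_len);
 *     return netlink_unicast(my_nl_sk, skb, dst_pid, MSG_DONTWAIT);
 *
 * netlink_unicast() consumes the skb on both the success and the error path,
 * so the caller must not touch it afterwards.
 */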
sk); - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; - } - return -1; -} - -struct netlink_broadcast_data { - struct sock *exclude_sk; - struct net *net; - u32 pid; - u32 group; - int failure; - int congested; - int delivered; - gfp_t allocation; - struct sk_buff *skb, *skb2; -}; - -static inline int do_one_broadcast(struct sock *sk, - struct netlink_broadcast_data *p) -{ - struct netlink_sock *nlk = nlk_sk(sk); - int val; - - if (p->exclude_sk == sk) - goto out; - - if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || - !test_bit(p->group - 1, nlk->groups)) - goto out; - - if (!net_eq(sock_net(sk), p->net)) - goto out; - - if (p->failure) { - netlink_overrun(sk); - goto out; - } - - sock_hold(sk); - if (p->skb2 == NULL) { - if (skb_shared(p->skb)) { - p->skb2 = skb_clone(p->skb, p->allocation); - } else { - p->skb2 = skb_get(p->skb); - /* - * skb ownership may have been set when - * delivered to a previous socket. - */ - skb_orphan(p->skb2); - } - } - if (p->skb2 == NULL) { - netlink_overrun(sk); - /* Clone failed. Notify ALL listeners. */ - p->failure = 1; - } else if (sk_filter(sk, p->skb2)) { - kfree_skb(p->skb2); - p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { - netlink_overrun(sk); - } else { - p->congested |= val; - p->delivered = 1; - p->skb2 = NULL; - } - sock_put(sk); - -out: - return 0; -} - -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, - u32 group, gfp_t allocation) -{ - struct net *net = sock_net(ssk); - struct netlink_broadcast_data info; - struct hlist_node *node; - struct sock *sk; - - skb = netlink_trim(skb, allocation); - - info.exclude_sk = ssk; - info.net = net; - info.pid = pid; - info.group = group; - info.failure = 0; - info.congested = 0; - info.delivered = 0; - info.allocation = allocation; - info.skb = skb; - info.skb2 = NULL; - - /* While we sleep in clone, do not allow to change socket list */ - - netlink_lock_table(); - - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) - do_one_broadcast(sk, &info); - - kfree_skb(skb); - - netlink_unlock_table(); - - if (info.skb2) - kfree_skb(info.skb2); - - if (info.delivered) { - if (info.congested && (allocation & __GFP_WAIT)) - yield(); - return 0; - } - if (info.failure) - return -ENOBUFS; - return -ESRCH; -} -EXPORT_SYMBOL(netlink_broadcast); - -struct netlink_set_err_data { - struct sock *exclude_sk; - u32 pid; - u32 group; - int code; -}; - -static inline int do_one_set_err(struct sock *sk, - struct netlink_set_err_data *p) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (sk == p->exclude_sk) - goto out; - - if (sock_net(sk) != sock_net(p->exclude_sk)) - goto out; - - if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || - !test_bit(p->group - 1, nlk->groups)) - goto out; - - sk->sk_err = p->code; - sk->sk_error_report(sk); -out: - return 0; -} - -/** - * netlink_set_err - report error to broadcast listeners - * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() - * @pid: the PID of a process that we want to skip (if any) - * @groups: the broadcast group that will notice the error - * @code: error code, must be negative (as usual in kernelspace) - */ -void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) -{ - struct netlink_set_err_data info; - struct hlist_node *node; - struct sock *sk; - - info.exclude_sk = ssk; - info.pid = pid; - info.group = group; - /* sk->sk_err wants a positive error value */ - 
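/*
 * Illustrative sketch, not part of the original file: sending an event to
 * every listener of one multicast group via netlink_broadcast() above.
 * my_nl_sk, MY_GRP and build_event_skb() are assumptions, not names from
 * this file.
 *
 *     if (netlink_has_listeners(my_nl_sk, MY_GRP)) {
 *             struct sk_buff *skb = build_event_skb();
 *
 *             if (skb)
 *                     netlink_broadcast(my_nl_sk, skb, 0, MY_GRP, GFP_KERNEL);
 *     }
 *
 * A pid of 0 means no userspace listener is skipped. A return value of
 * -ESRCH just means nobody was subscribed; -ENOBUFS means a clone failed
 * and listeners were marked as overrun. The skb is always consumed.
 */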
info.code = -code; - - read_lock(&nl_table_lock); - - sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) - do_one_set_err(sk, &info); - - read_unlock(&nl_table_lock); -} - -/* must be called with netlink table grabbed */ -static void netlink_update_socket_mc(struct netlink_sock *nlk, - unsigned int group, - int is_new) -{ - int old, new = !!is_new, subscriptions; - - old = test_bit(group - 1, nlk->groups); - subscriptions = nlk->subscriptions - old + new; - if (new) - __set_bit(group - 1, nlk->groups); - else - __clear_bit(group - 1, nlk->groups); - netlink_update_subscriptions(&nlk->sk, subscriptions); - netlink_update_listeners(&nlk->sk); -} - -static int netlink_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - unsigned int val = 0; - int err; - - if (level != SOL_NETLINK) - return -ENOPROTOOPT; - - if (optlen >= sizeof(int) && - get_user(val, (unsigned int __user *)optval)) - return -EFAULT; - - switch (optname) { - case NETLINK_PKTINFO: - if (val) - nlk->flags |= NETLINK_RECV_PKTINFO; - else - nlk->flags &= ~NETLINK_RECV_PKTINFO; - err = 0; - break; - case NETLINK_ADD_MEMBERSHIP: - case NETLINK_DROP_MEMBERSHIP: { - if (!netlink_capable(sock, NL_NONROOT_RECV)) - return -EPERM; - err = netlink_realloc_groups(sk); - if (err) - return err; - if (!val || val - 1 >= nlk->ngroups) - return -EINVAL; - netlink_table_grab(); - netlink_update_socket_mc(nlk, val, - optname == NETLINK_ADD_MEMBERSHIP); - netlink_table_ungrab(); - err = 0; - break; - } - default: - err = -ENOPROTOOPT; - } - return err; -} - -static int netlink_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - int len, val, err; - - if (level != SOL_NETLINK) - return -ENOPROTOOPT; - - if (get_user(len, optlen)) - return -EFAULT; - if (len < 0) - return -EINVAL; - - switch (optname) { - case NETLINK_PKTINFO: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_RECV_PKTINFO ? 
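/*
 * Illustrative sketch, not part of the original file: the userspace side of
 * the membership handling above. Joining group 5 from an application:
 *
 *     int grp = 5;
 *     setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp));
 *
 * Unlike the 32-bit nl_groups bitmask passed to bind(), this option takes
 * the group number itself, so it also works for groups above 32.
 */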
1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; - break; - default: - err = -ENOPROTOOPT; - } - return err; -} - -static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) -{ - struct nl_pktinfo info; - - info.group = NETLINK_CB(skb).dst_group; - put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); -} - -static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) -{ - struct sock_iocb *siocb = kiocb_to_siocb(kiocb); - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *addr = msg->msg_name; - u32 dst_pid; - u32 dst_group; - struct sk_buff *skb; - int err; - struct scm_cookie scm; - - if (msg->msg_flags&MSG_OOB) - return -EOPNOTSUPP; - - if (NULL == siocb->scm) - siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); - if (err < 0) - return err; - - if (msg->msg_namelen) { - if (addr->nl_family != AF_NETLINK) - return -EINVAL; - dst_pid = addr->nl_pid; - dst_group = ffs(addr->nl_groups); - if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) - return -EPERM; - } else { - dst_pid = nlk->dst_pid; - dst_group = nlk->dst_group; - } - - if (!nlk->pid) { - err = netlink_autobind(sock); - if (err) - goto out; - } - - err = -EMSGSIZE; - if (len > sk->sk_sndbuf - 32) - goto out; - err = -ENOBUFS; - skb = alloc_skb(len, GFP_KERNEL); - if (skb == NULL) - goto out; - - NETLINK_CB(skb).pid = nlk->pid; - NETLINK_CB(skb).dst_group = dst_group; - NETLINK_CB(skb).loginuid = audit_get_loginuid(current); - NETLINK_CB(skb).sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &(NETLINK_CB(skb).sid)); - memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - - /* What can I do? Netlink is asynchronous, so that - we will have to save current capabilities to - check them, when this message will be delivered - to corresponding kernel module. 
--ANK (980802) - */ - - err = -EFAULT; - if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { - kfree_skb(skb); - goto out; - } - - err = security_netlink_send(sk, skb); - if (err) { - kfree_skb(skb); - goto out; - } - - if (dst_group) { - atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); - } - err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); - -out: - return err; -} - -static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len, - int flags) -{ - struct sock_iocb *siocb = kiocb_to_siocb(kiocb); - struct scm_cookie scm; - struct sock *sk = sock->sk; - struct netlink_sock *nlk = nlk_sk(sk); - int noblock = flags&MSG_DONTWAIT; - size_t copied; - struct sk_buff *skb; - int err; - - if (flags&MSG_OOB) - return -EOPNOTSUPP; - - copied = 0; - - skb = skb_recv_datagram(sk, flags, noblock, &err); - if (skb == NULL) - goto out; - - msg->msg_namelen = 0; - - copied = skb->len; - if (len < copied) { - msg->msg_flags |= MSG_TRUNC; - copied = len; - } - - skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); - - if (msg->msg_name) { - struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; - addr->nl_family = AF_NETLINK; - addr->nl_pad = 0; - addr->nl_pid = NETLINK_CB(skb).pid; - addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); - msg->msg_namelen = sizeof(*addr); - } - - if (nlk->flags & NETLINK_RECV_PKTINFO) - netlink_cmsg_recv_pktinfo(msg, skb); - - if (NULL == siocb->scm) { - memset(&scm, 0, sizeof(scm)); - siocb->scm = &scm; - } - siocb->scm->creds = *NETLINK_CREDS(skb); - if (flags & MSG_TRUNC) - copied = skb->len; - skb_free_datagram(sk, skb); - - if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) - netlink_dump(sk); - - scm_recv(sock, msg, siocb->scm, flags); -out: - netlink_rcv_wake(sk); - return err ? : copied; -} - -static void netlink_data_ready(struct sock *sk, int len) -{ - BUG(); -} - -/* - * We export these functions to other modules. They provide a - * complete set of kernel non-blocking support for message - * queueing. - */ - -struct sock * -netlink_kernel_create(struct net *net, int unit, unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, struct module *module) -{ - struct socket *sock; - struct sock *sk; - struct netlink_sock *nlk; - unsigned long *listeners = NULL; - - BUG_ON(!nl_table); - - if (unit < 0 || unit >= MAX_LINKS) - return NULL; - - if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) - return NULL; - - /* - * We have to just have a reference on the net from sk, but don't - * get_net it. Besides, we cannot get and then put the net here. - * So we create one inside init_net and the move it to net. 
- */ - - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) - goto out_sock_release_nosk; - - sk = sock->sk; - sk_change_net(sk, net); - - if (groups < 32) - groups = 32; - - listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL); - if (!listeners) - goto out_sock_release; - - sk->sk_data_ready = netlink_data_ready; - if (input) - nlk_sk(sk)->netlink_rcv = input; - - if (netlink_insert(sk, net, 0)) - goto out_sock_release; - - nlk = nlk_sk(sk); - nlk->flags |= NETLINK_KERNEL_SOCKET; - - netlink_table_grab(); - if (!nl_table[unit].registered) { - nl_table[unit].groups = groups; - nl_table[unit].listeners = listeners; - nl_table[unit].cb_mutex = cb_mutex; - nl_table[unit].module = module; - nl_table[unit].registered = 1; - } else { - kfree(listeners); - nl_table[unit].registered++; - } - netlink_table_ungrab(); - return sk; - -out_sock_release: - kfree(listeners); - netlink_kernel_release(sk); - return NULL; - -out_sock_release_nosk: - sock_release(sock); - return NULL; -} -EXPORT_SYMBOL(netlink_kernel_create); - - -void -netlink_kernel_release(struct sock *sk) -{ - sk_release_kernel(sk); -} -EXPORT_SYMBOL(netlink_kernel_release); - - -/** - * netlink_change_ngroups - change number of multicast groups - * - * This changes the number of multicast groups that are available - * on a certain netlink family. Note that it is not possible to - * change the number of groups to below 32. Also note that it does - * not implicitly call netlink_clear_multicast_users() when the - * number of groups is reduced. - * - * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). - * @groups: The new number of groups. - */ -int netlink_change_ngroups(struct sock *sk, unsigned int groups) -{ - unsigned long *listeners, *old = NULL; - struct netlink_table *tbl = &nl_table[sk->sk_protocol]; - int err = 0; - - if (groups < 32) - groups = 32; - - netlink_table_grab(); - if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { - listeners = kzalloc(NLGRPSZ(groups), GFP_ATOMIC); - if (!listeners) { - err = -ENOMEM; - goto out_ungrab; - } - old = tbl->listeners; - memcpy(listeners, old, NLGRPSZ(tbl->groups)); - rcu_assign_pointer(tbl->listeners, listeners); - } - tbl->groups = groups; - - out_ungrab: - netlink_table_ungrab(); - synchronize_rcu(); - kfree(old); - return err; -} -EXPORT_SYMBOL(netlink_change_ngroups); - -/** - * netlink_clear_multicast_users - kick off multicast listeners - * - * This function removes all listeners from the given group. - * @ksk: The kernel netlink socket, as returned by - * netlink_kernel_create(). - * @group: The multicast group to clear. - */ -void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) -{ - struct sock *sk; - struct hlist_node *node; - struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; - - netlink_table_grab(); - - sk_for_each_bound(sk, node, &tbl->mc_list) - netlink_update_socket_mc(nlk_sk(sk), group, 0); - - netlink_table_ungrab(); -} -EXPORT_SYMBOL(netlink_clear_multicast_users); - -void netlink_set_nonroot(int protocol, unsigned int flags) -{ - if ((unsigned int)protocol < MAX_LINKS) - nl_table[protocol].nl_nonroot = flags; -} -EXPORT_SYMBOL(netlink_set_nonroot); - -static void netlink_destroy_callback(struct netlink_callback *cb) -{ - if (cb->skb) - kfree_skb(cb->skb); - kfree(cb); -} - -/* - * It looks a bit ugly. - * It would be better to create kernel thread. 
- */ - -static int netlink_dump(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - struct netlink_callback *cb; - struct sk_buff *skb; - struct nlmsghdr *nlh; - int len, err = -ENOBUFS; - - skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); - if (!skb) - goto errout; - - mutex_lock(nlk->cb_mutex); - - cb = nlk->cb; - if (cb == NULL) { - err = -EINVAL; - goto errout_skb; - } - - len = cb->dump(skb, cb); - - if (len > 0) { - mutex_unlock(nlk->cb_mutex); - - if (sk_filter(sk, skb)) - kfree_skb(skb); - else { - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - } - return 0; - } - - nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI); - if (!nlh) - goto errout_skb; - - memcpy(nlmsg_data(nlh), &len, sizeof(len)); - - if (sk_filter(sk, skb)) - kfree_skb(skb); - else { - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - } - - if (cb->done) - cb->done(cb); - nlk->cb = NULL; - mutex_unlock(nlk->cb_mutex); - - netlink_destroy_callback(cb); - return 0; - -errout_skb: - mutex_unlock(nlk->cb_mutex); - kfree_skb(skb); -errout: - return err; -} - -int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, - struct nlmsghdr *nlh, - int (*dump)(struct sk_buff *skb, - struct netlink_callback *), - int (*done)(struct netlink_callback *)) -{ -#ifdef DDE_LINUX - return -ENOBUFS; -#else - struct netlink_callback *cb; - struct sock *sk; - struct netlink_sock *nlk; - - cb = kzalloc(sizeof(*cb), GFP_KERNEL); - if (cb == NULL) - return -ENOBUFS; - - cb->dump = dump; - cb->done = done; - cb->nlh = nlh; - atomic_inc(&skb->users); - cb->skb = skb; - - sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).pid); - if (sk == NULL) { - netlink_destroy_callback(cb); - return -ECONNREFUSED; - } - nlk = nlk_sk(sk); - /* A dump is in progress... */ - mutex_lock(nlk->cb_mutex); - if (nlk->cb) { - mutex_unlock(nlk->cb_mutex); - netlink_destroy_callback(cb); - sock_put(sk); - return -EBUSY; - } - nlk->cb = cb; - mutex_unlock(nlk->cb_mutex); - - netlink_dump(sk); - sock_put(sk); - - /* We successfully started a dump, by returning -EINTR we - * signal not to send ACK even if it was requested. - */ - return -EINTR; -#endif -} -EXPORT_SYMBOL(netlink_dump_start); - -void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) -{ - struct sk_buff *skb; - struct nlmsghdr *rep; - struct nlmsgerr *errmsg; - size_t payload = sizeof(*errmsg); - - /* error messages get the original request appened */ - if (err) - payload += nlmsg_len(nlh); - - skb = nlmsg_new(payload, GFP_KERNEL); - if (!skb) { - struct sock *sk; - - sk = netlink_lookup(sock_net(in_skb->sk), - in_skb->sk->sk_protocol, - NETLINK_CB(in_skb).pid); - if (sk) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - sock_put(sk); - } - return; - } - - rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, - NLMSG_ERROR, sizeof(struct nlmsgerr), 0); - errmsg = nlmsg_data(rep); - errmsg->error = err; - memcpy(&errmsg->msg, nlh, err ? 
nlh->nlmsg_len : sizeof(*nlh)); - netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); -} -EXPORT_SYMBOL(netlink_ack); - -int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, - struct nlmsghdr *)) -{ - struct nlmsghdr *nlh; - int err; - - while (skb->len >= nlmsg_total_size(0)) { - int msglen; - - nlh = nlmsg_hdr(skb); - err = 0; - - if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) - return 0; - - /* Only requests are handled by the kernel */ - if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) - goto ack; - - /* Skip control messages */ - if (nlh->nlmsg_type < NLMSG_MIN_TYPE) - goto ack; - - err = cb(skb, nlh); - if (err == -EINTR) - goto skip; - -ack: - if (nlh->nlmsg_flags & NLM_F_ACK || err) - netlink_ack(skb, nlh, err); - -skip: - msglen = NLMSG_ALIGN(nlh->nlmsg_len); - if (msglen > skb->len) - msglen = skb->len; - skb_pull(skb, msglen); - } - - return 0; -} -EXPORT_SYMBOL(netlink_rcv_skb); - -/** - * nlmsg_notify - send a notification netlink message - * @sk: netlink socket to use - * @skb: notification message - * @pid: destination netlink pid for reports or 0 - * @group: destination multicast group or 0 - * @report: 1 to report back, 0 to disable - * @flags: allocation flags - */ -int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, - unsigned int group, int report, gfp_t flags) -{ - int err = 0; - - if (group) { - int exclude_pid = 0; - - if (report) { - atomic_inc(&skb->users); - exclude_pid = pid; - } - - /* errors reported via destination sk->sk_err */ - nlmsg_multicast(sk, skb, exclude_pid, group, flags); - } - - if (report) - err = nlmsg_unicast(sk, skb, pid); - - return err; -} -EXPORT_SYMBOL(nlmsg_notify); - -#ifdef CONFIG_PROC_FS -struct nl_seq_iter { - struct seq_net_private p; - int link; - int hash_idx; -}; - -static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) -{ - struct nl_seq_iter *iter = seq->private; - int i, j; - struct sock *s; - struct hlist_node *node; - loff_t off = 0; - - for (i = 0; i < MAX_LINKS; i++) { - struct nl_pid_hash *hash = &nl_table[i].hash; - - for (j = 0; j <= hash->mask; j++) { - sk_for_each(s, node, &hash->table[j]) { - if (sock_net(s) != seq_file_net(seq)) - continue; - if (off == pos) { - iter->link = i; - iter->hash_idx = j; - return s; - } - ++off; - } - } - } - return NULL; -} - -static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(nl_table_lock) -{ - read_lock(&nl_table_lock); - return *pos ? 
netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; -} - -static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct sock *s; - struct nl_seq_iter *iter; - int i, j; - - ++*pos; - - if (v == SEQ_START_TOKEN) - return netlink_seq_socket_idx(seq, 0); - - iter = seq->private; - s = v; - do { - s = sk_next(s); - } while (s && sock_net(s) != seq_file_net(seq)); - if (s) - return s; - - i = iter->link; - j = iter->hash_idx + 1; - - do { - struct nl_pid_hash *hash = &nl_table[i].hash; - - for (; j <= hash->mask; j++) { - s = sk_head(&hash->table[j]); - while (s && sock_net(s) != seq_file_net(seq)) - s = sk_next(s); - if (s) { - iter->link = i; - iter->hash_idx = j; - return s; - } - } - - j = 0; - } while (++i < MAX_LINKS); - - return NULL; -} - -static void netlink_seq_stop(struct seq_file *seq, void *v) - __releases(nl_table_lock) -{ - read_unlock(&nl_table_lock); -} - - -static int netlink_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "sk Eth Pid Groups " - "Rmem Wmem Dump Locks\n"); - else { - struct sock *s = v; - struct netlink_sock *nlk = nlk_sk(s); - - seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n", - s, - s->sk_protocol, - nlk->pid, - nlk->groups ? (u32)nlk->groups[0] : 0, - atomic_read(&s->sk_rmem_alloc), - atomic_read(&s->sk_wmem_alloc), - nlk->cb, - atomic_read(&s->sk_refcnt) - ); - - } - return 0; -} - -static const struct seq_operations netlink_seq_ops = { - .start = netlink_seq_start, - .next = netlink_seq_next, - .stop = netlink_seq_stop, - .show = netlink_seq_show, -}; - - -static int netlink_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &netlink_seq_ops, - sizeof(struct nl_seq_iter)); -} - -static const struct file_operations netlink_seq_fops = { - .owner = THIS_MODULE, - .open = netlink_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -#endif - -int netlink_register_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&netlink_chain, nb); -} -EXPORT_SYMBOL(netlink_register_notifier); - -int netlink_unregister_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&netlink_chain, nb); -} -EXPORT_SYMBOL(netlink_unregister_notifier); - -static const struct proto_ops netlink_ops = { - .family = PF_NETLINK, - .owner = THIS_MODULE, - .release = netlink_release, - .bind = netlink_bind, - .connect = netlink_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = netlink_getname, - .poll = datagram_poll, - .ioctl = sock_no_ioctl, - .listen = sock_no_listen, - .shutdown = sock_no_shutdown, - .setsockopt = netlink_setsockopt, - .getsockopt = netlink_getsockopt, - .sendmsg = netlink_sendmsg, - .recvmsg = netlink_recvmsg, - .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, -}; - -static struct net_proto_family netlink_family_ops = { - .family = PF_NETLINK, - .create = netlink_create, - .owner = THIS_MODULE, /* for consistency 8) */ -}; - -static int __net_init netlink_net_init(struct net *net) -{ -#ifdef CONFIG_PROC_FS - if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) - return -ENOMEM; -#endif - return 0; -} - -static void __net_exit netlink_net_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - proc_net_remove(net, "netlink"); -#endif -} - -static struct pernet_operations __net_initdata netlink_net_ops = { - .init = netlink_net_init, - .exit = netlink_net_exit, -}; - -static int __init netlink_proto_init(void) -{ - struct sk_buff 
*dummy_skb; - int i; - unsigned long limit; - unsigned int order; - int err = proto_register(&netlink_proto, 0); - - if (err != 0) - goto out; - - BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)); - - nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); - if (!nl_table) - goto panic; - - if (num_physpages >= (128 * 1024)) - limit = num_physpages >> (21 - PAGE_SHIFT); - else - limit = num_physpages >> (23 - PAGE_SHIFT); - - order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; - limit = (1UL << order) / sizeof(struct hlist_head); - order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; - - for (i = 0; i < MAX_LINKS; i++) { - struct nl_pid_hash *hash = &nl_table[i].hash; - - hash->table = nl_pid_hash_zalloc(1 * sizeof(*hash->table)); - if (!hash->table) { - while (i-- > 0) - nl_pid_hash_free(nl_table[i].hash.table, - 1 * sizeof(*hash->table)); - kfree(nl_table); - goto panic; - } - hash->max_shift = order; - hash->shift = 0; - hash->mask = 0; - hash->rehash_time = jiffies; - } - - sock_register(&netlink_family_ops); - register_pernet_subsys(&netlink_net_ops); - /* The netlink device handler may be needed early. */ - rtnetlink_init(); -out: - return err; -panic: - panic("netlink_init: Cannot allocate nl_table\n"); -} - -core_initcall(netlink_proto_init); diff --git a/libdde_linux26/lib/src/net/sched/.svn/all-wcprops b/libdde_linux26/lib/src/net/sched/.svn/all-wcprops deleted file mode 100644 index 7ca1df0b..00000000 --- a/libdde_linux26/lib/src/net/sched/.svn/all-wcprops +++ /dev/null @@ -1,11 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 68 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/sched -END -sch_generic.c -K 25 -svn:wc:ra_dav:version-url -V 82 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src/net/sched/sch_generic.c -END diff --git a/libdde_linux26/lib/src/net/sched/.svn/entries b/libdde_linux26/lib/src/net/sched/.svn/entries deleted file mode 100644 index b59bf68f..00000000 --- a/libdde_linux26/lib/src/net/sched/.svn/entries +++ /dev/null @@ -1,62 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/net/sched -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -sch_generic.c -file - - - - -2009-11-15T17:17:07.000000Z -6ca4e60f13182fd1229b0dbff53f04f1 -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -17494 - diff --git a/libdde_linux26/lib/src/net/sched/.svn/format b/libdde_linux26/lib/src/net/sched/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/net/sched/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src/net/sched/.svn/text-base/sch_generic.c.svn-base b/libdde_linux26/lib/src/net/sched/.svn/text-base/sch_generic.c.svn-base deleted file mode 100644 index a2acd6c4..00000000 --- a/libdde_linux26/lib/src/net/sched/.svn/text-base/sch_generic.c.svn-base +++ /dev/null @@ -1,749 +0,0 @@ -/* - * net/sched/sch_generic.c Generic packet scheduler routines. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 - * - Ingress support - */ - -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/init.h> -#include <linux/rcupdate.h> -#include <linux/list.h> -#include <net/pkt_sched.h> - -#ifdef DDE_LINUX -#include "local.h" -#endif - -/* Main transmission queue. */ - -/* Modifications to data participating in scheduling must be protected with - * qdisc_lock(qdisc) spinlock. - * - * The idea is the following: - * - enqueue, dequeue are serialized via qdisc root lock - * - ingress filtering is also serialized via qdisc root lock - * - updates to tree and tree walking are only done under the rtnl mutex. - */ - -static inline int qdisc_qlen(struct Qdisc *q) -{ - return q->q.qlen; -} - -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) -{ - q->gso_skb = skb; - q->qstats.requeues++; - __netif_schedule(q); - - return 0; -} - -static inline struct sk_buff *dequeue_skb(struct Qdisc *q) -{ - struct sk_buff *skb = q->gso_skb; - - if (unlikely(skb)) { - struct net_device *dev = qdisc_dev(q); - struct netdev_queue *txq; - - /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) - q->gso_skb = NULL; - else - skb = NULL; - } else { - skb = q->dequeue(q); - } - - return skb; -} - -static inline int handle_dev_cpu_collision(struct sk_buff *skb, - struct netdev_queue *dev_queue, - struct Qdisc *q) -{ - int ret; - - if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { - /* - * Same CPU holding the lock. It may be a transient - * configuration error, when hard_start_xmit() recurses. We - * detect it by checking xmit owner and drop the packet when - * deadloop is detected. Return OK to try the next skb. - */ - kfree_skb(skb); - if (net_ratelimit()) - printk(KERN_WARNING "Dead loop on netdevice %s, " - "fix it urgently!\n", dev_queue->dev->name); - ret = qdisc_qlen(q); - } else { - /* - * Another cpu is holding lock, requeue & delay xmits for - * some time. - */ - __get_cpu_var(netdev_rx_stat).cpu_collision++; - ret = dev_requeue_skb(skb, q); - } - - return ret; -} - -/* - * NOTE: Called under qdisc_lock(q) with locally disabled BH. - * - * __QDISC_STATE_RUNNING guarantees only one CPU can process - * this qdisc at a time. qdisc_lock(q) serializes queue accesses for - * this queue. - * - * netif_tx_lock serializes accesses to device driver. - * - * qdisc_lock(q) and netif_tx_lock are mutually exclusive, - * if one is grabbed, another must be free. - * - * Note, that this procedure can be called by a watchdog timer - * - * Returns to the caller: - * 0 - queue is empty or throttled. - * >0 - queue is not empty. 
- * - */ -static inline int qdisc_restart(struct Qdisc *q) -{ - struct netdev_queue *txq; - int ret = NETDEV_TX_BUSY; - struct net_device *dev; - spinlock_t *root_lock; - struct sk_buff *skb; - - /* Dequeue packet */ - if (unlikely((skb = dequeue_skb(q)) == NULL)) - return 0; - - root_lock = qdisc_lock(q); - - /* And release qdisc */ - spin_unlock(root_lock); - - dev = qdisc_dev(q); - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_tx_queue_stopped(txq) && - !netif_tx_queue_frozen(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); - HARD_TX_UNLOCK(dev, txq); - - spin_lock(root_lock); - - switch (ret) { - case NETDEV_TX_OK: - /* Driver sent out skb successfully */ - ret = qdisc_qlen(q); - break; - - case NETDEV_TX_LOCKED: - /* Driver try lock failed */ - ret = handle_dev_cpu_collision(skb, txq, q); - break; - - default: - /* Driver returned NETDEV_TX_BUSY - requeue skb */ - if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) - printk(KERN_WARNING "BUG %s code %d qlen %d\n", - dev->name, ret, q->q.qlen); - - ret = dev_requeue_skb(skb, q); - break; - } - - if (ret && (netif_tx_queue_stopped(txq) || - netif_tx_queue_frozen(txq))) - ret = 0; - - return ret; -} - -void __qdisc_run(struct Qdisc *q) -{ - unsigned long start_time = jiffies; - - while (qdisc_restart(q)) { - /* - * Postpone processing if - * 1. another process needs the CPU; - * 2. we've been doing it for too long. - */ - if (need_resched() || jiffies != start_time) { - __netif_schedule(q); - break; - } - } - - clear_bit(__QDISC_STATE_RUNNING, &q->state); -} - -static void dev_watchdog(unsigned long arg) -{ - struct net_device *dev = (struct net_device *)arg; - - netif_tx_lock(dev); - if (!qdisc_tx_is_noop(dev)) { - if (netif_device_present(dev) && - netif_running(dev) && - netif_carrier_ok(dev)) { - int some_queue_stopped = 0; - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq; - - txq = netdev_get_tx_queue(dev, i); - if (netif_tx_queue_stopped(txq)) { - some_queue_stopped = 1; - break; - } - } - - if (some_queue_stopped && - time_after(jiffies, (dev->trans_start + - dev->watchdog_timeo))) { - char drivername[64]; - WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n", - dev->name, netdev_drivername(dev, drivername, 64)); - dev->netdev_ops->ndo_tx_timeout(dev); - } - if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + - dev->watchdog_timeo))) - dev_hold(dev); - } - } - netif_tx_unlock(dev); - - dev_put(dev); -} - -void __netdev_watchdog_up(struct net_device *dev) -{ - if (dev->netdev_ops->ndo_tx_timeout) { - if (dev->watchdog_timeo <= 0) - dev->watchdog_timeo = 5*HZ; - if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + dev->watchdog_timeo))) - dev_hold(dev); - } -} - -static void dev_watchdog_up(struct net_device *dev) -{ - __netdev_watchdog_up(dev); -} - -static void dev_watchdog_down(struct net_device *dev) -{ - netif_tx_lock_bh(dev); - if (del_timer(&dev->watchdog_timer)) - dev_put(dev); - netif_tx_unlock_bh(dev); -} - -/** - * netif_carrier_on - set carrier - * @dev: network device - * - * Device has detected that carrier. 
- */ -void netif_carrier_on(struct net_device *dev) -{ - if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { - if (dev->reg_state == NETREG_UNINITIALIZED) - return; - linkwatch_fire_event(dev); - if (netif_running(dev)) - __netdev_watchdog_up(dev); - } -} -EXPORT_SYMBOL(netif_carrier_on); - -/** - * netif_carrier_off - clear carrier - * @dev: network device - * - * Device has detected loss of carrier. - */ -void netif_carrier_off(struct net_device *dev) -{ - if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { - if (dev->reg_state == NETREG_UNINITIALIZED) - return; - linkwatch_fire_event(dev); - } -} -EXPORT_SYMBOL(netif_carrier_off); - -/* "NOOP" scheduler: the best scheduler, recommended for all interfaces - under all circumstances. It is difficult to invent anything faster or - cheaper. - */ - -static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) -{ - kfree_skb(skb); - return NET_XMIT_CN; -} - -static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) -{ - return NULL; -} - -struct Qdisc_ops noop_qdisc_ops __read_mostly = { - .id = "noop", - .priv_size = 0, - .enqueue = noop_enqueue, - .dequeue = noop_dequeue, - .peek = noop_dequeue, - .owner = THIS_MODULE, -}; - -static struct netdev_queue noop_netdev_queue = { - .qdisc = &noop_qdisc, - .qdisc_sleeping = &noop_qdisc, -}; - -struct Qdisc noop_qdisc = { - .enqueue = noop_enqueue, - .dequeue = noop_dequeue, - .flags = TCQ_F_BUILTIN, - .ops = &noop_qdisc_ops, - .list = LIST_HEAD_INIT(noop_qdisc.list), - .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), - .dev_queue = &noop_netdev_queue, -}; -EXPORT_SYMBOL(noop_qdisc); - -static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { - .id = "noqueue", - .priv_size = 0, - .enqueue = noop_enqueue, - .dequeue = noop_dequeue, - .peek = noop_dequeue, - .owner = THIS_MODULE, -}; - -static struct Qdisc noqueue_qdisc; -static struct netdev_queue noqueue_netdev_queue = { - .qdisc = &noqueue_qdisc, - .qdisc_sleeping = &noqueue_qdisc, -}; - -static struct Qdisc noqueue_qdisc = { - .enqueue = NULL, - .dequeue = noop_dequeue, - .flags = TCQ_F_BUILTIN, - .ops = &noqueue_qdisc_ops, - .list = LIST_HEAD_INIT(noqueue_qdisc.list), - .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), - .dev_queue = &noqueue_netdev_queue, -}; - - -static const u8 prio2band[TC_PRIO_MAX+1] = - { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; - -/* 3-band FIFO queue: old style, but should be a bit faster than - generic prio+fifo combination. 
- */ - -#define PFIFO_FAST_BANDS 3 - -static inline struct sk_buff_head *prio2list(struct sk_buff *skb, - struct Qdisc *qdisc) -{ - struct sk_buff_head *list = qdisc_priv(qdisc); - return list + prio2band[skb->priority & TC_PRIO_MAX]; -} - -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) -{ - struct sk_buff_head *list = prio2list(skb, qdisc); - - if (skb_queue_len(list) < qdisc_dev(qdisc)->tx_queue_len) { - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } - - return qdisc_drop(skb, qdisc); -} - -static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) -{ - int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); - - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { - if (!skb_queue_empty(list + prio)) { - qdisc->q.qlen--; - return __qdisc_dequeue_head(qdisc, list + prio); - } - } - - return NULL; -} - -static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc) -{ - int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); - - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { - if (!skb_queue_empty(list + prio)) - return skb_peek(list + prio); - } - - return NULL; -} - -static void pfifo_fast_reset(struct Qdisc* qdisc) -{ - int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); - - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(qdisc, list + prio); - - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; -} - -static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) -{ - struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; - -#ifndef DDE_LINUX - memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); -#else - WARN_UNIMPL; -#endif - return skb->len; - -nla_put_failure: - return -1; -} - -static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) -{ - int prio; - struct sk_buff_head *list = qdisc_priv(qdisc); - - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - skb_queue_head_init(list + prio); - - return 0; -} - -static struct Qdisc_ops pfifo_fast_ops __read_mostly = { - .id = "pfifo_fast", - .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head), - .enqueue = pfifo_fast_enqueue, - .dequeue = pfifo_fast_dequeue, - .peek = pfifo_fast_peek, - .init = pfifo_fast_init, - .reset = pfifo_fast_reset, - .dump = pfifo_fast_dump, - .owner = THIS_MODULE, -}; - -struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops) -{ - void *p; - struct Qdisc *sch; - unsigned int size; - int err = -ENOBUFS; - - /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = QDISC_ALIGN(sizeof(*sch)); - size += ops->priv_size + (QDISC_ALIGNTO - 1); - - p = kzalloc(size, GFP_KERNEL); - if (!p) - goto errout; - sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); - sch->padded = (char *) sch - (char *) p; - - INIT_LIST_HEAD(&sch->list); - skb_queue_head_init(&sch->q); - sch->ops = ops; - sch->enqueue = ops->enqueue; - sch->dequeue = ops->dequeue; - sch->dev_queue = dev_queue; - dev_hold(qdisc_dev(sch)); - atomic_set(&sch->refcnt, 1); - - return sch; -errout: - return ERR_PTR(err); -} - -struct Qdisc * qdisc_create_dflt(struct net_device *dev, - struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, - unsigned int parentid) -{ - struct Qdisc *sch; - - sch = qdisc_alloc(dev_queue, ops); - if (IS_ERR(sch)) - goto errout; - sch->parent = parentid; - - if (!ops->init || ops->init(sch, NULL) == 0) - return sch; - - qdisc_destroy(sch); -errout: - return NULL; -} -EXPORT_SYMBOL(qdisc_create_dflt); - -/* Under qdisc_lock(qdisc) and BH! 
*/ - -void qdisc_reset(struct Qdisc *qdisc) -{ - const struct Qdisc_ops *ops = qdisc->ops; - - if (ops->reset) - ops->reset(qdisc); - - kfree_skb(qdisc->gso_skb); - qdisc->gso_skb = NULL; -} -EXPORT_SYMBOL(qdisc_reset); - -void qdisc_destroy(struct Qdisc *qdisc) -{ - const struct Qdisc_ops *ops = qdisc->ops; - - if (qdisc->flags & TCQ_F_BUILTIN || - !atomic_dec_and_test(&qdisc->refcnt)) - return; - -#ifdef CONFIG_NET_SCHED -#ifndef DDE_LINUX - qdisc_list_del(qdisc); - - qdisc_put_stab(qdisc->stab); - gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); -#endif -#endif - if (ops->reset) - ops->reset(qdisc); - if (ops->destroy) - ops->destroy(qdisc); - - module_put(ops->owner); - dev_put(qdisc_dev(qdisc)); - - kfree_skb(qdisc->gso_skb); - kfree((char *) qdisc - qdisc->padded); -} -EXPORT_SYMBOL(qdisc_destroy); - -static bool dev_all_qdisc_sleeping_noop(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - if (txq->qdisc_sleeping != &noop_qdisc) - return false; - } - return true; -} - -static void attach_one_default_qdisc(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_unused) -{ - struct Qdisc *qdisc; - - if (dev->tx_queue_len) { - qdisc = qdisc_create_dflt(dev, dev_queue, - &pfifo_fast_ops, TC_H_ROOT); - if (!qdisc) { - printk(KERN_INFO "%s: activation failed\n", dev->name); - return; - } - } else { - qdisc = &noqueue_qdisc; - } - dev_queue->qdisc_sleeping = qdisc; -} - -static void transition_one_qdisc(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_need_watchdog) -{ - struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping; - int *need_watchdog_p = _need_watchdog; - - if (!(new_qdisc->flags & TCQ_F_BUILTIN)) - clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); - - rcu_assign_pointer(dev_queue->qdisc, new_qdisc); - if (need_watchdog_p && new_qdisc != &noqueue_qdisc) - *need_watchdog_p = 1; -} - -void dev_activate(struct net_device *dev) -{ - int need_watchdog; - - /* No queueing discipline is attached to device; - create default one i.e. 
pfifo_fast for devices, - which need queueing and noqueue_qdisc for - virtual interfaces - */ - - if (dev_all_qdisc_sleeping_noop(dev)) - netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - - if (!netif_carrier_ok(dev)) - /* Delay activation until next carrier-on event */ - return; - - need_watchdog = 0; - netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); - transition_one_qdisc(dev, &dev->rx_queue, NULL); - - if (need_watchdog) { - dev->trans_start = jiffies; - dev_watchdog_up(dev); - } -} - -static void dev_deactivate_queue(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_qdisc_default) -{ - struct Qdisc *qdisc_default = _qdisc_default; - struct Qdisc *qdisc; - - qdisc = dev_queue->qdisc; - if (qdisc) { - spin_lock_bh(qdisc_lock(qdisc)); - - if (!(qdisc->flags & TCQ_F_BUILTIN)) - set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - - rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - qdisc_reset(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - } -} - -static bool some_qdisc_is_busy(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *dev_queue; - spinlock_t *root_lock; - struct Qdisc *q; - int val; - - dev_queue = netdev_get_tx_queue(dev, i); - q = dev_queue->qdisc_sleeping; - root_lock = qdisc_lock(q); - - spin_lock_bh(root_lock); - - val = (test_bit(__QDISC_STATE_RUNNING, &q->state) || - test_bit(__QDISC_STATE_SCHED, &q->state)); - - spin_unlock_bh(root_lock); - - if (val) - return true; - } - return false; -} - -void dev_deactivate(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); - dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc); - - dev_watchdog_down(dev); - -#ifndef DDE_LINUX - /* Wait for outstanding qdisc-less dev_queue_xmit calls. */ - synchronize_rcu(); -#endif - - /* Wait for outstanding qdisc_run calls. 
*/ - while (some_qdisc_is_busy(dev)) - yield(); -} - -static void dev_init_scheduler_queue(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_qdisc) -{ - struct Qdisc *qdisc = _qdisc; - - dev_queue->qdisc = qdisc; - dev_queue->qdisc_sleeping = qdisc; -} - -void dev_init_scheduler(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); - dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); - - setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); -} - -static void shutdown_scheduler_queue(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_qdisc_default) -{ - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; - struct Qdisc *qdisc_default = _qdisc_default; - - if (qdisc) { - rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - dev_queue->qdisc_sleeping = qdisc_default; - - qdisc_destroy(qdisc); - } -} - -void dev_shutdown(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); - shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); - WARN_ON(timer_pending(&dev->watchdog_timer)); -} diff --git a/libdde_linux26/lib/src/security/.svn/all-wcprops b/libdde_linux26/lib/src/security/.svn/all-wcprops deleted file mode 100644 index 5a54e50c..00000000 --- a/libdde_linux26/lib/src/security/.svn/all-wcprops +++ /dev/null @@ -1,5 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 67 -/repos/tudos/!svn/ver/174/trunk/l4/pkg/dde/linux26/lib/src/security -END diff --git a/libdde_linux26/lib/src/security/.svn/entries b/libdde_linux26/lib/src/security/.svn/entries deleted file mode 100644 index 2fc734cc..00000000 --- a/libdde_linux26/lib/src/security/.svn/entries +++ /dev/null @@ -1,28 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src/security -http://svn.tudos.org/repos/tudos - - - -2007-09-08T19:44:13.897747Z -174 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - diff --git a/libdde_linux26/lib/src/security/.svn/format b/libdde_linux26/lib/src/security/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src/security/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src_ip/.svn/all-wcprops b/libdde_linux26/lib/src_ip/.svn/all-wcprops deleted file mode 100644 index d04375ed..00000000 --- a/libdde_linux26/lib/src_ip/.svn/all-wcprops +++ /dev/null @@ -1,17 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 61 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src_ip -END -broken -K 25 -svn:wc:ra_dav:version-url -V 68 -/repos/tudos/!svn/ver/455/trunk/l4/pkg/dde/linux26/lib/src_ip/broken -END -Makefile -K 25 -svn:wc:ra_dav:version-url -V 70 -/repos/tudos/!svn/ver/322/trunk/l4/pkg/dde/linux26/lib/src_ip/Makefile -END diff --git a/libdde_linux26/lib/src_ip/.svn/entries b/libdde_linux26/lib/src_ip/.svn/entries deleted file mode 100644 index f5379f5f..00000000 --- a/libdde_linux26/lib/src_ip/.svn/entries +++ /dev/null @@ -1,99 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src_ip -http://svn.tudos.org/repos/tudos - - - -2009-05-20T14:32:55.606606Z -455 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -linux -dir - -broken -file - - - - -2009-11-15T17:17:04.000000Z -d41d8cd98f00b204e9800998ecf8427e -2009-05-20T14:32:55.606606Z -455 -l4check - - - - - - - - - - - - - - - - - - - - - -0 - -Makefile -file - - - - 
-2009-11-15T17:17:04.000000Z -7e808a8830267ec522311a3fa05f9776 -2008-03-18T03:51:56.301196Z -322 -l4check - - - - - - - - - - - - - - - - - - - - - -1774 - diff --git a/libdde_linux26/lib/src_ip/.svn/format b/libdde_linux26/lib/src_ip/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src_ip/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/libdde_linux26/lib/src_ip/.svn/text-base/Makefile.svn-base b/libdde_linux26/lib/src_ip/.svn/text-base/Makefile.svn-base deleted file mode 100644 index 9665ff9b..00000000 --- a/libdde_linux26/lib/src_ip/.svn/text-base/Makefile.svn-base +++ /dev/null @@ -1,52 +0,0 @@ -PKGDIR ?= ../../.. -L4DIR ?= $(PKGDIR)/../.. -CONTRIB ?= $(PKGDIR)/linux26/contrib - --include $(PKGDIR_OBJ)/Makeconf - -ifeq ($(CONFIG_DDE26_NET),y) -TARGET += libdde_linux26_net_ip.a -endif - -SYSTEMS = x86-l4v2 - -ifeq ($(ARCH), x86) -ARCH_DIR = arch/i386 -endif - -# contrib sources are in $(CONTRIB) -vpath %.c $(CONTRIB) $(PKGDIR)/linux26/lib/src -vpath %.S $(CONTRIB) $(PKGDIR)/linux26/lib/src - -PRIVATE_INCDIR += $(CONTRIB)/drivers/pci $(PKGDIR)/linux26/lib/src/arch/l4 \ - $(CONTRIB)/$(ARCH_DIR)/pci $(CONTRIB)/drivers/base/ \ - $(CONTRIB)/lib $(PKGDIR_OBJ) $(CONTRIB)/net/core - -ifeq ($(ARCH), x86) -SRC_S_libdde_linux26_net_ip.a += $(ARCH_DIR)/lib/checksum.S -endif - -SRC_C_libdde_linux26_net_ip.a += \ - arch/l4/net.c \ - drivers/net/mii.c \ - net/core/skbuff.c \ - net/core/utils.c \ - net/core/dev.c \ - net/core/ethtool.c \ - net/core/link_watch.c \ - net/core/dev_mcast.c \ - net/core/neighbour.c \ - net/core/netevent.c \ - net/ethernet/eth.c \ - net/sched/sch_generic.c \ - arch/l4/inodes.c \ - mm/memory.c \ - net/core/filter.c \ - net/core/rtnetlink.c \ - net/core/sock.c \ - net/netlink/af_netlink.c \ - net/netlink/attr.c - -include $(PKGDIR)/linux26/Makeconf - -include $(L4DIR)/mk/lib.mk diff --git a/libdde_linux26/lib/src_ip/.svn/text-base/broken.svn-base b/libdde_linux26/lib/src_ip/.svn/text-base/broken.svn-base deleted file mode 100644 index e69de29b..00000000 --- a/libdde_linux26/lib/src_ip/.svn/text-base/broken.svn-base +++ /dev/null diff --git a/libdde_linux26/lib/src_ip/linux/.svn/all-wcprops b/libdde_linux26/lib/src_ip/linux/.svn/all-wcprops deleted file mode 100644 index 7b75c705..00000000 --- a/libdde_linux26/lib/src_ip/linux/.svn/all-wcprops +++ /dev/null @@ -1,11 +0,0 @@ -K 25 -svn:wc:ra_dav:version-url -V 67 -/repos/tudos/!svn/ver/322/trunk/l4/pkg/dde/linux26/lib/src_ip/linux -END -autoconf.h -K 25 -svn:wc:ra_dav:version-url -V 78 -/repos/tudos/!svn/ver/322/trunk/l4/pkg/dde/linux26/lib/src_ip/linux/autoconf.h -END diff --git a/libdde_linux26/lib/src_ip/linux/.svn/entries b/libdde_linux26/lib/src_ip/linux/.svn/entries deleted file mode 100644 index 1043dcd1..00000000 --- a/libdde_linux26/lib/src_ip/linux/.svn/entries +++ /dev/null @@ -1,62 +0,0 @@ -9 - -dir -465 -http://svn.tudos.org/repos/tudos/trunk/l4/pkg/dde/linux26/lib/src_ip/linux -http://svn.tudos.org/repos/tudos - - - -2008-03-18T03:51:56.301196Z -322 -l4check - - -svn:special svn:externals svn:needs-lock - - - - - - - - - - - -a704ac0b-3a55-4d43-a2a9-7be6f07c34fb - -autoconf.h -file - - - - -2009-11-15T17:17:04.000000Z -59f22062ea21fbca61a108210ce85e73 -2008-03-18T03:51:56.301196Z -322 -l4check - - - - - - - - - - - - - - - - - - - - - -139 - diff --git a/libdde_linux26/lib/src_ip/linux/.svn/format b/libdde_linux26/lib/src_ip/linux/.svn/format deleted file mode 100644 index ec635144..00000000 --- a/libdde_linux26/lib/src_ip/linux/.svn/format +++ /dev/null @@ -1 +0,0 @@ -9 
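A minimal usage sketch of the kernel-side netlink API provided by net/netlink/af_netlink.c, which the src_ip Makefile above links into libdde_linux26_net_ip.a. Everything named "example" below (including the protocol number) is a hypothetical placeholder, not taken from this tree; only the netlink_kernel_create(), netlink_kernel_release() and netlink_rcv_skb() signatures come from the source shown earlier in this diff.

/* Illustrative only -- identifiers marked "example" are assumptions. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>

#define NETLINK_EXAMPLE 31   /* assumed free protocol slot (< MAX_LINKS) */

static struct sock *example_nl_sk;

/* Called by netlink_rcv_skb() once per well-formed request message. */
static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        printk(KERN_INFO "example: nlmsg type %u from pid %u\n",
               nlh->nlmsg_type, NETLINK_CB(skb).pid);
        return 0;   /* 0 lets netlink_rcv_skb() ACK when NLM_F_ACK is set */
}

/* Input callback handed to netlink_kernel_create(); runs for every skb
 * queued on the kernel socket. */
static void example_nl_input(struct sk_buff *skb)
{
        netlink_rcv_skb(skb, &example_rcv_msg);
}

static int __init example_init(void)
{
        example_nl_sk = netlink_kernel_create(&init_net, NETLINK_EXAMPLE,
                                              0 /* groups */, example_nl_input,
                                              NULL /* cb_mutex */, THIS_MODULE);
        return example_nl_sk ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        netlink_kernel_release(example_nl_sk);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Note that, as the implementation above shows, passing 0 for groups is clamped to 32 listener groups, and under DDE_LINUX the dump machinery (netlink_dump_start()) is stubbed to return -ENOBUFS, so only plain request/reply and broadcast traffic is usable in this build.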
diff --git a/libdde_linux26/lib/src_ip/linux/.svn/text-base/autoconf.h.svn-base b/libdde_linux26/lib/src_ip/linux/.svn/text-base/autoconf.h.svn-base deleted file mode 100644 index 7ddb3693..00000000 --- a/libdde_linux26/lib/src_ip/linux/.svn/text-base/autoconf.h.svn-base +++ /dev/null @@ -1,5 +0,0 @@ -/* Include the original DDE26 autoconf file */ -#include_next <linux/autoconf.h> - -/* Because we DO need INET support */ -#define CONFIG_INET 1
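The deleted autoconf.h above illustrates the wrapper-header trick the DDE build relies on: a local linux/autoconf.h that the build arranges to be found before the real one re-exports the original header via GCC's #include_next and then forces extra configuration options on top. A minimal sketch of the same pattern, with a made-up option name:

/* Hypothetical wrapper header, placed in a directory that appears on the
 * compiler's include path BEFORE the directory holding the real
 * <linux/autoconf.h>. */

/* #include_next (a GCC extension) resumes the header search in the
 * remaining include directories, so the original file is pulled in
 * instead of this wrapper including itself recursively. */
#include_next <linux/autoconf.h>

/* Then force the options this build variant needs on top of the
 * imported configuration (CONFIG_EXAMPLE_OPTION is a made-up name). */
#undef  CONFIG_EXAMPLE_OPTION
#define CONFIG_EXAMPLE_OPTION 1

This is what lets src_ip compile the unmodified contrib sources (reached through the vpath entries in its Makefile) while still building them with CONFIG_INET enabled.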
